Contenido Project

utf8/core/lib/Contenido/Parser.pm


54	54	$content = $res->decoded_content( charset => 'none' );
55	55	# warn "Charset: ".$res->content_charset."\n";
56	56	} else {
57		warn $res->status_line." \n";
	57	warn $res->status_line." \n" if $DEBUG;
58	58	$self->{success} = 0;
59	59	$self->{reason} = $res->status_line;
60	60	return $self;
…	…
207	207	return \@tags;
208	208	}
209	209
	210	### Имеет дело с "ободранным" тегом,
	211	# в котором отстутсвуют < и >
	212	########################################
	213	sub parse_html_tag {
	214	my $self = shift;
	215	my $tagstr = shift;
	216
	217	my %struct;
	218	for ( $tagstr ) {
	219	s/\ =\ /=/g;
	220	$_ = __encode_quotes($_);
	221	}
	222	my @tag = split /\ +/, $tagstr;
	223	$struct{tag} = lc(shift @tag);
	224
	225	foreach my $str ( @tag ) {
	226	if ( $str =~ /^(.?)=(.)$/ ) {
	227	my $attr = lc($1);
	228	my $val = $2;
	229	for ( $val ) {
	230	s/^"//;
	231	s/"$//;
	232	s/ /\ /sg;
	233	}
	234	$struct{$attr} = $val;
	235	}
	236	}
	237	return \%struct;
	238	}
	239
	240
210	241	sub __encode_quotes {
211	242	my $str = shift;
212	243	my @in = split //, $str;
…	…
230	261	return $out;
231	262	}
232	263
	264
	265	sub image_replace {
	266	my ($self, $img_params, $replace_struct) = @_;
	267
	268	my $img = $self->parse_html_tag('img '.$img_params);
	269	if ( exists $replace_struct->{$img->{src}} ) {
	270	my $new_image = $replace_struct->{$img->{src}};
	271	if ( ref $new_image && exists $new_image->{filename} ) {
	272	$img->{src} = $new_image->{filename};
	273	} else {
	274	$img->{src} = $new_image;
	275	}
	276	return '<img '.join(' ', map { $_.'="'.$img->{$_}.'"' } grep { $_ ne 'tag' } keys %$img).'>';
	277	} else {
	278	return '';
	279	}
	280	}
	281
233	282	sub scheme {
234	283	my $uri = shift;
235	284	my $scheme;

utf8/core/lib/Contenido/Parser/HTML.pm


110	110	warn "Make tree...\n" if $debug;
111	111	my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
112	112
113		$self->__extract_img ($shortcuts, $base_url, $debug);
	113	$self->__extract_img ($shortcuts, $base_url, $strip_html, $debug);
114	114	$self->__extract_headers ($shortcuts, $header, $debug);
115	115	warn "Getting big texts (min=$minimum)...\n" if $debug;
116	116	my $chosen = $self->__dig_big_texts (
…	…
141	141	chosen => $chosen,
142	142	header => $header,
143	143	ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
144		debug => $debug
	144	debug => $debug,
	145	strip_html => $strip_html,
145	146	);
146	147	if ( ref $parse_rools eq 'ARRAY' ) {
147	148	my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
…	…
595	596
596	597
597	598	sub __extract_img {
598		my ($self, $structure, $base_url, $debug) = @_;
	599	my ($self, $structure, $base_url, $strip_html, $debug) = @_;
599	600	return unless ref $structure eq 'HASH';
600	601
601	602	foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
602		my $text = $tag->{text};
603		while ( $text =~ /<img (.*?)>/sgi ) {
	603	while ( $tag->{text} =~ /<img (.*?)\/?>/sgi ) {
604	604	# warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
605	605	my $params = $1;
606		my $img = {};
607		if ( $params =~ /src\x20?=\x20?["'](.*?)["']/ \|\| $params =~ /src=([^\x20]+)/ ) {
608		$img->{url} = $1;
609		$img->{url} =~ s/[\r\t\n\ ]+$//;
610		$img->{url} =~ s/^[\r\t\n\ ]+//;
611		$img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
612		$img->{url} =~ s/\/+/\//sgi;
613		$img->{url} =~ s/http:\//http:\/\//sgi;
614		$img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
615		$img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
616		$img->{alt} = $1 if $params =~ /alt\x20?=\x20?["'](.*?)["']/;
	606	my $img = $self->parse_html_tag('img '.$params);
	607	if ( exists $img->{src} && $img->{src} ) {
	608	my %img = ( src => $img->{src} );
	609	$img{url} = $img{src} =~ /^http[s]?:/ ? $img{src} : $base_url.($img{src} =~ m\|^/\| ? '' : '/').$img{src};
	610	$img{w} = $img->{width} if $img->{width};
	611	$img{h} = $img->{height} if $img->{height};
	612	$img{alt} = $img->{alt} if $img->{alt};
	613	$img{title} = $img->{title} if $img->{title};
617	614	$tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
618		push @{ $tag->{images} }, $img;
	615	push @{ $tag->{images} }, \%img;
	616	}
	617	# if ( $params =~ /src\x20?=\x20?["'](.*?)["']/ \|\| $params =~ /src=([^\x20]+)/ ) {
	618	# $img->{url} = $1;
	619	# $img->{url} =~ s/[\r\t\n\ ]+$//;
	620	# $img->{url} =~ s/^[\r\t\n\ ]+//;
	621	# $img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
	622	# $img->{url} =~ s/\/+/\//sgi;
	623	# $img->{url} =~ s/http:\//http:\/\//sgi;
	624	# $img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
	625	# $img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
	626	# $img->{alt} = $1 if $params =~ /alt\x20?=\x20?["'](.*?)["']/;
	627	# $tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
	628	# push @{ $tag->{images} }, $img;
619	629	# warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
620		}
	630	# }
621	631	}
622		$text =~ s/<img (.*?)>//sgi;
623		$tag->{text} = $text;
624		$tag->{count} = length ($text);
	632	$tag->{text} =~ s/<img (.*?)>//sgi if $strip_html;
	633	$tag->{count} = length ($tag->{text});
625	634	}
626	635	}
627	636
…	…
716	725	s/\&\\x(\d+)//sgi;
717	726	}
718	727	push @ret, $tag;
	728	# $self->log_elem($tag);
719	729	}
720	730	}
721	731	}
…	…
815	825
816	826	my $chosen = $opts{chosen};
817	827	my $rooles = $opts{rools};
818		my $header = $opts{header};
	828	my $header = $opts{header} \|\| '';
	829	my $strip_html = $opts{strip_html};
819	830
820	831	foreach my $unit ( @$chosen ) {
821	832	my %tags;
…	…
861	872	s/^(\d+):(\d+)//si;
862	873	s/^[\ \t\r\n]+//si;
863	874	}
864		if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
	875	if ( $header && lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
865	876	substr $unit->{text}, 0, length($header), '';
866	877	$unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
867	878	}
…	…
1195	1206	}
1196	1207
1197	1208
	1209	sub log_elem {
	1210	my $self = shift;
	1211	my $elem = shift;
	1212	return unless ref $elem eq 'HASH';
	1213
	1214	my %elem;
	1215	map { $elem{$_} = $elem->{$_} } grep { $_ ne 'parent' } keys %$elem;
	1216	warn Dumper \%elem;
	1217	}
	1218
	1219
1198	1220	1;

utf8/core/ports/all/libwww/GNUmakefile


5	5
6	6	#include ../../etc/perl.mk
7	7
8		PORTVERSION = 5.805
	8	PORTVERSION = 5.836
9	9	DISTFILE = ${PORTNAME}-perl-${PORTVERSION}.tar.gz
10	10	PERL_MAKEMAKER = yes
11	11	MASTER_CPAN_SUBDIR = LWP

Repository List / Contenido @ r146

Revision Navigation

Revision 146

Legend:

utf8/core/lib/Contenido/Parser.pm

utf8/core/lib/Contenido/Parser/HTML.pm

utf8/core/ports/all/libwww/GNUmakefile

Небольшая справка по веткам