Revision 101
Date:
2011/05/21 11:01:04
Author:
ahitrov
Revision Log:
Модификация HTML и RSS парсера
Files:
Legend:
Added
Removed
Modified
utf8/core/lib/Contenido/Parser/HTML.pm
10
10
use Utils::HTML;
11
11
use Data::Dumper;
12
12
use utf8;
13
use Encode;
13
14
14
15
15
my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
16
16
head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
17
17
basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
…
…
53
53
54
54
my $content;
55
55
if ( $opts{content} ) {
56
$content = delete $opts{content};
56
$content = decode('utf-8', delete $opts{content});
57
57
delete $self->{content};
58
58
} elsif ( $self->{success} || $self->{content} ) {
59
$content = delete $self->{content};
59
$content = decode('utf-8', delete $self->{content});
60
60
} else {
61
61
$self->{success} = 0;
62
62
return $self;
…
…
66
66
my $strip_html = delete $opts{strip_html};
67
67
my $debug = $DEBUG;
68
68
my $gui = delete $opts{gui};
69
my $header = delete $opts{header};
70
my $description = delete $opts{description};
69
my $header = decode('utf-8', delete $opts{header});
70
warn "Header length: ".length($header)."\n";
71
my $description = decode('utf-8', delete $opts{description});
72
warn "Description length: ".length($description)."\n";
71
73
my $minimum = delete $opts{min} || length $description;
72
74
73
75
my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
…
…
120
122
$self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
121
123
return $self->is_success(0) unless $gui;
122
124
}
123
@$chosen = sort { $a->{id} <=> $b->{id} } @$chosen;
124
125
if ( $description ) {
125
126
my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
126
127
$chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
…
…
165
166
}
166
167
}
167
168
$self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
168
$self->{html} = $content;
169
$self->{tree} = $shortcuts;
169
# $self->{html} = $content;
170
# $self->{tree} = $shortcuts;
171
$self->{tree} = $tree;
170
172
$self->{chosen} = $chosen;
171
173
} else {
172
174
$self->__post_rool ($chosen->[0], $post_rools, $description);
173
175
$self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
176
$self->{chosen} = $chosen;
177
map { $_->{parent} = undef } @$chosen if ref $chosen eq 'ARRAY';
174
178
$tree = undef;
175
179
foreach my $key ( keys %$shortcuts ) {
176
180
delete $shortcuts->{$key};
…
…
360
364
$last_text_tag->{type} = 'text';
361
365
$last_text_tag->{parent} = $current;
362
366
$last_text_tag->{level} = $level+1;
367
$last_text_tag->{text} = $tag->{content};
363
368
$elem_hash{$last_text_tag->{id}} = $last_text_tag;
364
369
push @{$current->{children}}, $last_text_tag;
365
370
$current->{text_count}++;
366
371
}
367
372
$current->{text_value} += $tag->{count};
368
373
splice @elems, 0, $tag->{count};
374
# warn "Tag opened. Next text: [".join('',$elems[0..10])."]\n";
369
375
} elsif ( ref $tag ) {
370
376
if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
371
377
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
…
…
391
397
$tag->{parent} = $current;
392
398
$tag->{level} = ++$level;
393
399
$elem_hash{$tag->{id}} = $tag;
400
push @{$current->{children}}, $tag;
394
401
push @stack, $current;
395
warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
402
# warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
396
403
$current = $tag;
397
404
splice @elems, 0, $tag->{count};
398
405
} else {
…
…
411
418
$last_text_tag->{id} = $id++;
412
419
$last_text_tag->{type} = 'text';
413
420
$last_text_tag->{parent} = $current;
421
$last_text_tag->{text} = $tag->{content};
414
422
$last_text_tag->{level} = $level+1;
415
423
$elem_hash{$last_text_tag->{id}} = $last_text_tag;
416
424
push @{$current->{children}}, $last_text_tag;
…
…
466
474
$last_text_tag->{text} .= shift @elems;
467
475
$last_text_tag->{count}++;
468
476
} else {
477
$last_text_tag = {};
469
478
$last_text_tag->{text} = shift @elems;
470
479
$last_text_tag->{count} = 1;
471
480
$last_text_tag->{id} = $id++;
…
…
490
499
my $i = 1;
491
500
my %tag;
492
501
my $tag = $content->[0];
493
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
502
while ( $i < (scalar @$content - 1) && $content->[$i] ne '<' && $content->[$i] ne '>' ) {
494
503
$tag .= $content->[$i];
495
504
$i++;
496
505
}
…
…
500
509
content => $tag,
501
510
count => $i,
502
511
};
503
} else {
504
if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
505
my $val = $1;
506
if ( $tag =~ /^<($val)\s*(.*)/i ) {
507
$tag{type} = lc($1);
508
my $args = $2;
509
$tag{count} = $i+1;
510
my %args;
511
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
512
$args{lc($1)} = $2;
513
}
514
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
515
$args{lc($1)} = $2;
516
}
517
while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
518
$args{lc($1)} = $2;
519
}
520
foreach my $arg ( qw( name id class width align ) ) {
521
$tag{params}{$arg} = $args{$arg} if exists $args{$arg};
522
}
523
return \%tag;
524
} else {
525
return {
526
type => 'text',
527
content => $tag,
528
count => $i,
529
};
512
}
513
$tag .= $content->[$i++];
514
# warn "TAG: [$tag]\n";
515
516
if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
517
my $val = $1;
518
if ( $tag =~ /^<($val)\s*(.*)/i ) {
519
$tag{type} = lc($1);
520
my $args = $2;
521
$tag{count} = $i;
522
my %args;
523
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
524
$args{lc($1)} = $2;
530
525
}
531
} else {
526
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
527
$args{lc($1)} = $2;
528
}
529
while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
530
$args{lc($1)} = $2;
531
}
532
foreach my $arg ( qw( name id class width align ) ) {
533
$tag{params}{$arg} = $args{$arg} if exists $args{$arg};
534
}
535
return \%tag;
536
} else {
532
537
return {
533
538
type => 'text',
534
539
content => $tag,
535
540
count => $i,
536
541
};
537
542
}
543
} else {
544
return {
545
type => 'text',
546
content => $tag,
547
count => $i,
548
};
538
549
}
539
550
}
540
551
…
…
544
555
my $i = 2;
545
556
my %tag;
546
557
my $tag = $content->[0].$content->[1];
547
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
558
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < (scalar @$content-1) ) {
548
559
$tag .= $content->[$i];
549
560
$i++;
550
561
}
…
…
554
565
content => $tag,
555
566
count => $i,
556
567
};
557
} else {
558
if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
568
}
569
$tag .= $content->[$i++];
570
# warn "TAG END: [$tag]\n";
571
if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
559
572
my $val = $1;
560
if ( $tag =~ /^<\/($val)\s*?$/i ) {
573
if ( $tag =~ /^<\/($val)[\s>]/i ) {
561
574
$tag{type} = lc($1);
562
$tag{count} = $i+1;
575
$tag{count} = $i;
563
576
return \%tag;
564
577
} else {
565
578
return {
…
…
568
581
count => $i,
569
582
};
570
583
}
571
} else {
584
} else {
572
585
return {
573
586
type => 'text',
574
587
content => $tag,
575
588
count => $i,
576
589
};
577
}
578
590
}
579
591
}
580
592
…
…
583
595
my ($self, $structure, $base_url, $debug) = @_;
584
596
return unless ref $structure eq 'HASH';
585
597
586
foreach my $tag ( values %$structure ) {
587
next unless ref $tag && exists $tag->{text} && $tag->{text};
598
foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
588
599
my $text = $tag->{text};
589
600
while ( $text =~ /<img (.*?)>/sgi ) {
590
601
# warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
…
…
592
603
my $img = {};
593
604
if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
594
605
$img->{url} = $1;
595
$img->{url} =~ s/[\r\t\n\x20]+$//;
596
$img->{url} =~ s/^[\r\t\n\x20]+//;
606
$img->{url} =~ s/[\r\t\n\ ]+$//;
607
$img->{url} =~ s/^[\r\t\n\ ]+//;
597
608
$img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
598
609
$img->{url} =~ s/\/+/\//sgi;
599
610
$img->{url} =~ s/http:\//http:\/\//sgi;
…
…
616
627
my ($self, $structure, $debug) = @_;
617
628
return unless ref $structure eq 'HASH';
618
629
619
foreach my $tag ( values %$structure ) {
620
next unless ref $tag && exists $tag->{text} && $tag->{text};
630
foreach my $tag ( grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
621
631
my $text = $tag->{text};
622
632
while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
623
633
my $header_level = $1;
…
…
648
658
}
649
659
650
660
my @ret;
651
foreach my $tid ( sort { $a <=> $b } keys %$structure ) {
652
my $tag = $structure->{$tid};
653
next unless ref $tag && exists $tag->{text} && $tag->{text};
654
next if $self->__exclude_rools($tag, \@exclude_rools);
661
foreach my $tag ( sort { $a->{id} <=> $b->{id} } grep { ref $_ && $_->{type} eq 'text' && $_->{text} } values %$structure ) {
662
next if $self->__exclude_rools($tag->{parent}, \@exclude_rools);
655
663
656
664
if ( @rools ) {
657
665
my $choose = 0;
658
666
foreach my $rool ( @rools ) {
659
667
my $matched = 1;
660
668
foreach my $cond ( @{$rool->{condition}} ) {
661
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
669
unless ( exists $tag->{parent}{params}{$cond->{param}} && $tag->{parent}{params}{$cond->{param}} eq $cond->{value} ) {
662
670
$matched = 0;
663
671
}
664
672
}
665
673
$choose ||= $matched;
666
674
}
667
675
if ( $choose ) {
668
$tag->{text} =~ s/^[\t\ \n\r]+//s;
669
$tag->{text} =~ s/[\t\ \n\r]+$//s;
670
$tag->{text} =~ s/[\t\ ]+/\ /sg;
671
$tag->{text} =~ s/\r//sg;
672
$tag->{text} =~ s/\n{2,}/\n\n/sg;
673
$tag->{text} =~ s/<a.*?>//sgi;
674
$tag->{text} =~ s/<\/a.*?>//sgi;
675
$tag->{text} =~ s/\&\\x(\d+)//sgi;
676
for ( $tag->{text} ) {
677
s/^[\t\ \n\r]+//s;
678
s/[\t\ \n\r]+$//s;
679
s/[\t\ ]+/\ /sg;
680
s/\r//sg;
681
s/\n{2,}/\n\n/sg;
682
s/\&\\x(\d+)//sgi;
683
}
676
684
677
685
my $text = $tag->{text};
678
686
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
679
# $text = Contenido::Parser::Util::strip_html($text);
687
$text = Contenido::Parser::Util::strip_html($text);
688
$tag->{text_weight} = length($text);
680
689
if ( length($text) >= $minimum ) {
681
push @ret, $tag;
690
for ( $tag->{text} ) {
691
s/<a.*?>//sgi;
692
s/<\/a.*?>//sgi;
693
}
694
push @ret, $tag;
682
695
}
683
696
}
684
697
} else {
698
for ( $tag->{text} ) {
699
s/^[\t\ \n\r]+//s;
700
s/[\t\ \n\r]+$//s;
701
s/[\t\ ]+/\ /sg;
702
s/\r//sg;
703
s/\n{2,}/\n\n/sg;
704
}
685
705
my $text = $tag->{text};
686
$text =~ s/^[\t\ \n\r]+//s;
687
$text =~ s/[\t\ \n\r]+$//s;
688
$text =~ s/[\t\ ]+/\ /sg;
689
$text =~ s/\r//sg;
690
$text =~ s/\n{2,}/\n\n/sg;
691
# It does wrong job:
692
# $text =~ s/([,!\?])(\S)/$1\ $2/sg;
693
$tag->{text} = $text;
694
706
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
695
# $text = Contenido::Parser::Util::strip_html($text);
707
$text = Contenido::Parser::Util::strip_html($text);
708
$tag->{text_weight} = length($text);
696
709
if ( length($text) >= $minimum ) {
697
$tag->{text} =~ s/<a.*?>//sgi;
698
$tag->{text} =~ s/<\/a.*?>//sgi;
699
$tag->{text} =~ s/\&\\x(\d+)//sgi;
710
for ( $tag->{text} ) {
711
s/<a.*?>//sgi;
712
s/<\/a.*?>//sgi;
713
s/\&\\x(\d+)//sgi;
714
}
700
715
push @ret, $tag;
701
716
}
702
717
}
…
…
836
851
# $unit->{text} = HTML::Entities::decode_entities($unit->{text});
837
852
# $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
838
853
for ( $unit->{text} ) {
839
s/^[\x20\t\r\n]+//si;
854
s/^[\ \t\r\n]+//si;
840
855
s/^(\d+)\.(\d+)\.(\d+)//si;
841
s/^[\x20\t\r\n]+//si;
856
s/^[\ \t\r\n]+//si;
842
857
s/^(\d+):(\d+)//si;
843
s/^[\x20\t\r\n]+//si;
858
s/^[\ \t\r\n]+//si;
844
859
}
845
860
if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
846
861
substr $unit->{text}, 0, length($header), '';
847
$unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi;
862
$unit->{text} =~ s/^[\.\ \t\r\n]+//sgi;
848
863
}
849
$unit->{text} =~ s/[\x20\t\r\n]+$//sgi;
864
$unit->{text} =~ s/[\ \t\r\n]+$//sgi;
850
865
}
851
866
}
852
867
…
…
928
943
push @images, @img;
929
944
}
930
945
} else {
931
next if ($tag->{level}+1) < $chosen->{level};
932
next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) );
946
next if ($tag->{level}+1) < $chosen->{parent}{level};
947
next if $image_depth && ( $tag->{level} > ($chosen->{parent}{level} + $image_depth) );
933
948
934
949
my $ok = 0;
935
my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2;
936
my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops;
950
my $uphops = $tag->{level} > $chosen->{parent}{level} ? 1 : 2;
951
my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{parent}{level} + $uphops;
937
952
next if ($hops - $uphops) > 4;
938
953
my @img_parents = ($tag->{id});
939
954
my $parent = $tag;
…
…
941
956
$parent = $parent->{parent};
942
957
push @img_parents, $parent->{id};
943
958
}
944
$parent = $chosen;;
959
$parent = $chosen->{parent}{parent};
945
960
for ( 0..$uphops ) {
946
961
if ( grep { $parent->{id} == $_ } @img_parents ) {
947
962
$ok = 1;
…
…
951
966
}
952
967
if ( $ok ) {
953
968
my @img = grep { $self->__img_is_valid ($_) } map {
954
my $img = rchannel::Image->new($_);
955
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
969
my $img = $_;
970
$img->{src} = $base_url.($img->{src} =~ m|^/| ? '' : '/').$img->{src} unless $img->{src} =~ /^http:/;
956
971
$img;
957
972
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
958
973
…
…
972
987
sub __img_is_valid {
973
988
my ($self, $img) = @_;
974
989
975
# return 1;
990
return 1;
976
991
if ( $img->check_online ) {
977
992
my $delim = 0;
978
993
my $w = $img->width;
utf8/core/lib/Contenido/Parser/Util.pm
26
26
return join "\n\n", grep length $_, @paragfs;
27
27
}
28
28
29
sub strip_html {
30
my $text = shift;
31
32
if ( ref $text ) {
33
for ( $$text ) {
34
s/<\/?[^>]+>//sgi;
35
}
36
} else {
37
for ( $text ) {
38
s/<\/?[^>]+>//sgi;
39
}
40
return $text;
41
}
42
}
43
29
44
1;
Небольшая справка по веткам
cnddist – контейнер, в котором хранятся все дистрибутивы всех библиотек и программных пакетов, которые использовались при построении различных версий Contenido. Если какой-то библиотеки в данном хранилище нет, инсталлятор сделает попытку "подтянуть" ее с веба (например, с CPAN). Если библиотека слишком старая, есть очень большая вероятность, что ее там уже нет. Поэтому мы храним весь хлам от всех сборок. Если какой-то дистрибутив вдруг отсутствует в cnddist - напишите нам, мы положим его туда.
koi8 – отмирающая ветка, чей код, выдача и все внутренние библиотеки заточены на кодировку KOI8-R. Вносятся только те дополнения, которые касаются внешнего вида и функционала админки, баги ядра, обязательные обновления портов и мелочи, которые легко скопипастить. В дальнейшем планируется полная остановка поддержки по данной ветке.
utf8 – актуальная ветка, заточенная под UTF-8.
Внутри каждой ветки: core – исходники ядра; install – скрипт установки инсталляции; plugins – плагины; samples – "готовые к употреблению" проекты, которые можно поставить, запустить и посмотреть, как они работают.