Revision 98
- Date:
- 2011/04/27 15:28:25
- Files:
Legend:
- Added
- Removed
- Modified
-
utf8/core/lib/Contenido/Parser/RSS.pm
1 package Contenido::Parser::RSS; 2 3 use strict; 4 use warnings; 5 use locale; 6 7 use Contenido::Parser::Util; 8 use Utils::HTML; 9 #use Time::ParseDate; 10 #use Date::Parse; 11 use Data::Dumper; 12 use Digest::MD5 qw(md5_hex); 13 #use Class::Date; 14 use Encode; 15 use utf8; 16 17 my @INVALID_TAGS = qw ( A ABBREV ACRONYM ADDRESS APP APPLET AREA AU B BANNER BASE BASEFONT BDO BGSOUND BIG BLINK BLOCKQUOTE 18 BODY BQ BR CAPTION CENTER CITE CODE COL COLGROUP CREDIT DD DEL DFN DIR DIV DL DT EM FN FIG FONT FORM FRAME FRAMESET 19 H1 H2 H3 H4 H5 H6 HP HR I IMG INPUT INS ISINDEX KBD LANG LH LI LISTING MAP MARQUEE MENU META NEXTID NOBR NOEMBED 20 NOFRAMES NOTE OL OPTION OVERLAY P PARAM PERSON PLAINTEXT PRE Q S SAMP SELECT SMALL SPAN STRIKE STRONG SUB SUP TAB 21 TABLE TBODY TD TEXTAREA TFOOT TH THEAD TR TT U UL VAR WBR XMP EMBED 22 ); 23 24 sub new { 25 my ($proto) = @_; 26 my $class = ref($proto) || $proto; 27 my $self = {}; 28 bless $self, $class; 29 30 return $self; 31 } 32 33 sub parse { 34 my ($self, %opts) = @_; 35 36 my $content = delete $opts{content}; 37 my $base_url = delete $opts{base_url}; 38 my $strip_html = delete $opts{strip_html}; 39 my $allow_global_fulltext = delete $opts{allow_fulltext} || 0; 40 my $content_global_type = delete $opts{content_type} || 1; 41 my $debug = delete $opts{debug}; 42 my $gui = delete $opts{gui}; 43 my $description_as_fulltext = delete $opts{description_as_fulltext}; 44 warn "Parser Rools: [".$opts{parser_rss}."]\n" if $debug; 45 46 my $rss_rools = $self->__parse_rools (delete $opts{parser_rss}); 47 48 warn "RSS Rools: ".Dumper ($rss_rools) if $debug; 49 50 my @items; 51 my $feed = $self->__parse_content(\$content); 52 53 if ( ref $feed eq 'ARRAY' ) { 54 foreach my $item ( @$feed ) { 55 my $fulltext_field; 56 my $content_type = $content_global_type; 57 my $allow_fulltext = $allow_global_fulltext; 58 $self->__check_rewrite ( item => $item, rools => $rss_rools ); 59 my $date = Time::ParseDate::parsedate($item->{pubdate}); 60 my $pubdate = Class::Date::localdate(Date::Parse::str2time($item->{pubdate})); 61 next if ref $item->{title}; 62 next if ref $item->{description}; 63 $self->__check_ignore ( item => $item, rools => $rss_rools ); 64 $self->__check_only ( item => $item, rools => $rss_rools ); 65 $item->{title} = $self->__field_prepare ($item->{title}); 66 $self->__check_filter ( gui => $gui, field => 'title', item => $item, rools => $rss_rools ); 67 my $title = $item->{title}; 68 my $link; 69 if ( ref $item->{link} eq 'HASH' ) { 70 if ( ( (exists $item->{link}{type} && $item->{link}{type} eq 'text/html') || !exists $item->{link}{type} ) && exists $item->{link}{href} ) { 71 $link = $item->{link}{href}; 72 } 73 } elsif ( ref $item->{link} eq 'ARRAY' ) { 74 foreach my $lnk ( @{ $item->{link} } ) { 75 if ( ref $lnk ) { 76 if ( ( (exists $lnk->{type} && $lnk->{type} eq 'text/html') || !exists $lnk->{type} ) && exists $lnk->{href} ) { 77 $link = $lnk->{href}; 78 } 79 } else { 80 $link = $lnk; 81 last; 82 } 83 } 84 } else { 85 $link = $item->{'link'} || (ref $item->{'url'} eq 'ARRAY' ? $item->{'url'}->[0] : $item->{'url'}); 86 } 87 $link = $self->__field_prepare ($link); 88 $link = $base_url.($link =~ m|^/| ? '' : '/' ).$link if $base_url && ($link !~ /^http:/); 89 $item->{description} = $self->__field_prepare ($item->{description}); 90 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools ); 91 my $description = $item->{description}; 92 if ( exists $item->{'rambler:fulltext'} && $item->{'rambler:fulltext'} ) { 93 $allow_fulltext = 1; 94 } 95 my $fulltext; 96 if ( $description_as_fulltext ) { 97 $fulltext = $description; 98 $fulltext_field = 'description' 99 } else { 100 if ( $gui ) { 101 foreach my $field ( qw( rambler:fulltext rambler:full-text yandex:full-text mailru:full-text content:encoded full-text fulltext text ) ) { 102 if ( exists $item->{$field} && $item->{$field} ) { 103 $fulltext_field = $field; 104 $fulltext = $item->{$field}; 105 last; 106 } 107 } 108 } else { 109 $fulltext = 110 $item->{'rambler:fulltext'} || 111 $item->{'rambler:full-text'} || 112 $item->{'yandex:full-text'} || 113 $item->{'mailru:full-text'} || 114 $item->{'content:encoded'} || 115 $item->{'full-text'} || 116 $item->{'fulltext'} || 117 $item->{'text'}; 118 } 119 if ( ref $fulltext eq 'HASH') { 120 my @values = values %$fulltext; 121 if ( scalar @values == 1 ) { 122 $fulltext = $values[0]; 123 } 124 } 125 if ( ref $fulltext eq 'ARRAY' ) { 126 $fulltext = join "\n", @$fulltext; 127 } 128 $self->__check_filter ( gui => $gui, field => 'fulltext', item => $item, text => \$fulltext, rools => $rss_rools ); 129 $fulltext = $self->__field_prepare ($fulltext); 130 } 131 if ( $fulltext && !$description ) { 132 $item->{description} = Utils::HTML::limit_words ( $fulltext, 150, 300 ); 133 $self->__check_filter ( gui => $gui, field => 'description', item => $item, rools => $rss_rools ); 134 $description = $item->{description}; 135 } 136 $allow_fulltext = 0 unless $fulltext; 137 my $author; 138 if ( exists $item->{author} && $item->{author} ) { 139 if ( ref $item->{author} eq 'HASH' && exists $item->{author}{name} ) { 140 $author = $item->{author}{name}; 141 } elsif ( !ref $item->{author} ) { 142 $author = $item->{author}; 143 } 144 } 145 my $category = []; 146 if ( exists $item->{category} && ref $item->{category} eq 'ARRAY' ) { 147 $category = $item->{category}; 148 } elsif ( exists $item->{category} ) { 149 $category = [$item->{category}]; 150 } 151 my @images; 152 if ( exists $item->{image} || exists $item->{enclosure} ) { 153 my @src = ref $item->{image} eq 'ARRAY' ? @{ $item->{image} } : ( $item->{image} ) if exists $item->{image}; 154 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; 155 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /image/ } @att; 156 @images = map { 157 my $img = rchannel::Image->new($_); 158 $img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/; $img; 159 } map { {src => $_->{url}, $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; 160 } 161 my @videos; 162 if ( exists $item->{video} || exists $item->{enclosure} ) { 163 my @src = ref $item->{video} eq 'ARRAY' ? @{ $item->{video} } : ( $item->{video} ) if exists $item->{video}; 164 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; 165 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /video/ } @att; 166 @videos = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : (), $_->{width} ? (width => $_->{width}) : (), $_->{height} ? (height => $_->{height}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; 167 } 168 my @audios; 169 if ( exists $item->{audio} || exists $item->{enclosure} ) { 170 my @src = ref $item->{audio} eq 'ARRAY' ? @{ $item->{audio} } : ( $item->{audio} ) if exists $item->{audio}; 171 my @att = ref $item->{enclosure} eq 'ARRAY' ? @{ $item->{enclosure} } : ( $item->{enclosure} ) if exists $item->{enclosure}; 172 @att = grep { ref $_ eq 'HASH' && $_->{type} =~ /audio/ } @att; 173 @audios = map { {src => $_->{url}, $_->{type} ? (type => $_->{type}) : (), $_->{title} ? (title => $_->{title}) : ()} } grep { ref $_ eq 'HASH' && exists $_->{url} } @src, @att; 174 } 175 my ($video_url, $audio_url); 176 if ( $content_type == 2 || @videos || exists $item->{'videourl'} || exists $item->{'video_url'} ) { 177 $video_url = exists $item->{video} && ref $item->{video} eq 'HASH' && exists $item->{video}{url} ? 178 $item->{video}{url} : 179 $item->{'videourl'} || 180 $item->{'video_url'} || 181 ($item->{'guid'} =~ /^http:/ ? $item->{'guid'} : undef) || 182 (exists $item->{'link'} && ref $item->{'link'} eq 'HASH' ? $item->{'link'}{'href'} || $item->{'link'}{'url'} : $item->{'link'} ); 183 $content_type = 2; 184 } 185 if ( @audios || exists $item->{'audiourl'} || exists $item->{'audio_url'} ) { 186 $audio_url = exists $item->{audio} && ref $item->{audio} eq 'HASH' && exists $item->{audio}{url} ? 187 $item->{audio}{url} : 188 $item->{'audiourl'} || $item->{'audio_url'}; 189 $content_type = 2; 190 } 191 my $related = []; 192 if ( exists $item->{'rambler:related'} && $item->{'rambler:related'} ) { 193 if ( ref $item->{'rambler:related'} eq 'ARRAY' ) { 194 foreach my $relitem ( @{ $item->{'rambler:related'} } ) { 195 my $rel = $self->__parse_related ( $relitem ); 196 push @$related, $rel if ref $rel; 197 } 198 } elsif ( ref $item->{'rambler:related'} eq 'HASH' ) { 199 my $rel = $self->__parse_related ( $item->{'rambler:related'} ); 200 push @$related, $rel if ref $rel; 201 } 202 } 203 @videos = grep { exists $_->{type} && lc($_->{type}) eq 'video/x-flv' && $_->{src} =~ /\.flv$/i } @videos; 204 push @items, { 205 'checksum' => md5_hex(encode_utf8($title.$description)), 206 'ignore' => $item->{ignore} || 0, 207 'title' => $title || '', 208 'title_gui' => $item->{title_gui} || $title || '', 209 'description' => $description || '', 210 'description_gui' => $item->{description_gui} || $description || '', 211 'desc_length' => length( $description || '' ), 212 'link' => $link || '', 213 'pubdate' => $pubdate || '', 214 'fulltext' => $fulltext || '', 215 'fulltext_gui' => $item->{fulltext_gui} || '', 216 'fulltext_field' => $fulltext_field || '', 217 'image' => @images ? $images[0] : undef, 218 'images' => @images ? \@images : undef, 219 'video' => @videos ? $videos[0] : undef, 220 'videos' => @videos ? \@videos : undef, 221 'categories' => $category, 222 'video_url' => $video_url || '', 223 'audio_url' => $audio_url || '', 224 'author' => $author || '', 225 'related' => $related, 226 'content_type' => $content_type, 227 'allow_fulltext' => $allow_fulltext, 228 }; 229 } 230 } else { 231 $self->error_message($@ || 'Something wrong while parsing content'); 232 return $self->is_success(0); 233 } 234 235 $self->items(\@items); 236 return $self->is_success(1); 237 } 238 239 240 sub __check_rewrite { 241 my ($self, %opts) = @_; 242 my $item = $opts{item}; 243 return unless ref $item; 244 return unless ref $opts{rools} eq 'ARRAY'; 245 my @rools = grep { $_->{action} eq 'rewrite' } @{ $opts{rools} }; 246 return unless @rools; 247 foreach my $rool ( @rools ) { 248 my $field = $rool->{target}; 249 my $value = $rool->{condition}; 250 if ( $value eq 'CURRENT_DATETIME' ) { 251 my $dt = DateTime->now( time_zone => "Europe/Moscow" ); 252 $value = $dt->ymd('-').'T'.$dt->hms.' MSK'; 253 } 254 $item->{$field} = $value if exists $item->{$field}; 255 } 256 } 257 258 259 sub __check_filter { 260 my ($self, %opts) = @_; 261 my $field = $opts{field}; 262 my $gui = $opts{gui}; 263 my $item = $opts{item}; 264 my $text = exists $opts{text} ? $opts{text} : undef; 265 return unless ref $item; 266 return unless exists $opts{text} || exists $item->{$field}; 267 return unless ref $opts{rools} eq 'ARRAY'; 268 my @rools = grep { $_->{action} eq 'filter' && $_->{target} eq $field } @{ $opts{rools} }; 269 return unless @rools; 270 foreach my $rool ( @rools ) { 271 if ( $rool->{command} eq 'cut' ) { 272 my $condition = $rool->{condition}; 273 if ( $rool->{subcommand} eq 'off' ) { 274 if ( $opts{gui} ) { 275 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); 276 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi; 277 $item->{$field."_gui"} = $field_gui; 278 } 279 if ( exists $opts{text} ) { 280 $$text =~ s/$condition//sgi; 281 } else { 282 $item->{$field} =~ s/$condition//sgi; 283 } 284 } elsif ( $rool->{subcommand} eq 'from' ) { 285 if ( $gui ) { 286 my $cut_text = ''; 287 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); 288 my $pos = index $field_gui, $condition; 289 if ( $pos >= 0 ) { 290 $cut_text = substr $field_gui, $pos, -1; 291 $field_gui = substr $field_gui, 0, $pos; 292 } 293 # $field_gui =~ s/($condition)(.*)$/<b style="color:red">$1$2<\/b>/si; 294 $item->{$field."_gui"} = $field_gui.'<b style="color:red">'.$cut_text.'</b>'; 295 } 296 if ( exists $opts{text} ) { 297 my $pos = index $$text, $condition; 298 if ( $pos >= 0 ) { 299 $$text = substr $$text, 0, $pos; 300 } 301 } else { 302 my $pos = index $item->{$field}, $condition; 303 if ( $pos >= 0 ) { 304 $item->{$field} = substr $item->{$field}, 0, $pos; 305 } 306 } 307 } elsif ( $rool->{subcommand} eq 'till' ) { 308 if ( $opts{gui} ) { 309 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); 310 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1$2<\/b>/si; 311 $item->{$field."_gui"} = $field_gui; 312 } 313 if ( exists $opts{text} ) { 314 $$text =~ s/^(.*?)($condition)//si; 315 } else { 316 $item->{$field} =~ s/^(.*?)($condition)//si; 317 } 318 } elsif ( $rool->{subcommand} eq 'untill' ) { 319 if ( $opts{gui} ) { 320 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); 321 $field_gui =~ s/^(.*?)($condition)/<b style="color:red">$1<\/b>$2/si; 322 $item->{$field."_gui"} = $field_gui; 323 } 324 if ( exists $opts{text} ) { 325 $$text =~ s/^(.*?)($condition)/$2/si; 326 } else { 327 $item->{$field} =~ s/^(.*?)($condition)/$2/si; 328 } 329 } elsif ( $rool->{subcommand} eq 'regex' ) { 330 if ( $opts{gui} ) { 331 my $field_gui = $item->{$field."_gui"} || (exists $opts{text} ? $$text : $item->{$field}); 332 if ( substr($condition,0,1) eq '^' ) { 333 my $cond = reverse($condition); 334 chop($cond); 335 $cond = reverse($cond); 336 $field_gui =~ s/^($cond)/<b style="color:red">$1<\/b>/si; 337 } elsif ( substr($condition,-1,1) eq '$' ) { 338 my $cond = $condition; 339 chop($cond); 340 $field_gui =~ s/($cond)$/<b style="color:red">$1<\/b>/si; 341 } else { 342 $field_gui =~ s/($condition)/<b style="color:red">$1<\/b>/sgi; 343 } 344 $item->{$field."_gui"} = $field_gui; 345 } 346 if ( exists $opts{text} ) { 347 $$text =~ s/$condition//sgi; 348 } else { 349 $item->{$field} =~ s/$condition//sgi; 350 } 351 } 352 } elsif ( $rool->{command} eq 'regex' ) { 353 my $from = $rool->{condition}{from}; 354 my $to = $rool->{condition}{to}; 355 if ( exists $opts{text} ) { 356 eval ("\$\$text =~ s/$from/$to/sgi"); 357 } else { 358 eval ("\$item->{\$field} =~ s/$from/$to/sgi"); 359 } 360 } 361 } 362 } 363 364 365 sub __check_ignore { 366 my ($self, %opts) = @_; 367 my $item = $opts{item}; 368 return unless ref $item; 369 return unless ref $opts{rools} eq 'ARRAY'; 370 my @rools = grep { $_->{action} eq 'ignore' } @{ $opts{rools} }; 371 return unless @rools; 372 foreach my $rool ( @rools ) { 373 my $target = $rool->{target}; 374 if ( $rool->{command} =~ /^contain/ ) { 375 $item->{ignore} = 1 if index (lc($item->{$target}), lc($rool->{condition})) >= 0; 376 } 377 if ( $rool->{command} eq '=' ) { 378 $item->{ignore} = 1 if lc($item->{$target}) eq lc($rool->{condition}); 379 } 380 if ( $rool->{command} eq 'regex' ) { 381 my $regex = $rool->{condition}; 382 $item->{ignore} = 1 if $item->{$target} =~ /$regex/sgi; 383 } 384 } 385 } 386 387 388 sub __check_only { 389 my ($self, %opts) = @_; 390 my $item = $opts{item}; 391 return unless ref $item; 392 return unless ref $opts{rools} eq 'ARRAY'; 393 my @rools = grep { $_->{action} eq 'only' } @{ $opts{rools} }; 394 return unless @rools; 395 foreach my $rool ( @rools ) { 396 my $target = $rool->{target}; 397 if ( $rool->{command} =~ /^contain/ ) { 398 $item->{ignore} = 1 unless index (lc($item->{$target}), lc($rool->{condition})) >= 0; 399 } 400 if ( $rool->{command} eq '=' ) { 401 $item->{ignore} = 1 unless lc($item->{$target}) eq lc($rool->{condition}); 402 } 403 if ( $rool->{command} eq 'regex' ) { 404 my $regex = $rool->{condition}; 405 $item->{ignore} = 1 unless $item->{$target} =~ /$regex/sgi; 406 } 407 } 408 } 409 410 411 sub __feed_type { 412 my ($self, $contref) = @_; 413 414 my $type; 415 if ( $$contref =~ /<rss([^>]*)>/ ) { 416 $type = 'RSS'; 417 } elsif ( $$contref =~ /<feed\s+([^>]*)>/ ) { 418 my $feed_params = $1; 419 my $params = $self->__parse_params ($feed_params); 420 if ( exists $params->{xmlns} && $params->{xmlns} =~ /purl.org\/atom/ ) { 421 $type = 'ATOM'; 422 } elsif ( exists $params->{xmlns} && $params->{xmlns} =~ /www.w3.org\/2005\/Atom/ ) { 423 $type = 'ATOM'; 424 } 425 } elsif ( $$contref =~ /<rdf([^>]*)>/ ) { 426 $type = 'RDF'; 427 } 428 return $type; 429 } 430 431 432 sub __parse_content { 433 my ($self, $contref) = @_; 434 435 my $feed_type = $self->__feed_type($contref); 436 # warn "FEED Type = [$feed_type]\n"; 437 return undef unless $feed_type; 438 439 $$contref =~ s/>\s+</></sgi; 440 $$contref =~ s/<items(.*?)>(.*?)<\/items([^>]*?)>//sgi; 441 $$contref =~ s/<\/?br(.*?)>/\n/sgi; 442 $$contref =~ s/<\/?nobr(.*?)>//sgi; 443 #$$contref =~ s/<p>/\n\n/sgi; 444 #$$contref =~ s/<p\s(.*?)>/\n\n/sgi; 445 $$contref =~ s/<\/?strong\s(.*?)>//sgi; 446 $$contref =~ s/<\/?s>//sgi; 447 $$contref =~ s/<\/?i>//sgi; 448 $$contref =~ s/<\/?b>//sgi; 449 $$contref =~ s/<\/?strong>//sgi; 450 #$$contref =~ s/<\/p>//sgi; 451 #$$contref =~ s/<\/p\s(.*?)>//sgi; 452 my @items; 453 454 if ( $feed_type eq 'RSS' ) { 455 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) { 456 my $item_params = $1; 457 my $item_body = $2; 458 # warn "BODY: [$item_body]\n\n"; 459 my $params = $self->__parse_params ($item_params); 460 my $item = $self->__parse_item_RSS ($item_body) || {}; 461 if ( ref $params eq 'HASH' ) { 462 foreach my $key ( %$params ) { 463 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) { 464 push @{ $item->{$key} }, $params->{$key}; 465 } elsif ( exists $item->{$key} ) { 466 my @arr = ( $item->{$key}, $params->{$key} ); 467 $item->{$key} = \@arr; 468 } else { 469 $item->{$key} = $params->{$key}; 470 } 471 } 472 } 473 if ( ref $item eq 'HASH' && scalar keys %$item ) { 474 if ( exists $item->{'feedburner:origlink'} ) { 475 $item->{link} = ref $item->{'feedburner:origlink'} eq 'ARRAY' ? $item->{'feedburner:origlink'}->[0] : $item->{'feedburner:origlink'}; 476 } elsif ( !exists $item->{link} ) { 477 foreach my $key ( qw( guid ) ) { 478 if ( exists $item->{$key} ) { 479 $item->{link} = $item->{$key}; 480 last; 481 } 482 } 483 } 484 push @items, $item; 485 } 486 # warn Dumper($item); 487 } 488 } 489 if ( $feed_type eq 'RDF' ) { 490 while ( $$contref =~ /<item(.*?)>(.*?)<\/item([^>]*?)>/sgi ) { 491 my $item_params = $1; 492 my $item_body = $2; 493 # warn "BODY: [$item_body]\n\n"; 494 my $params = $self->__parse_params ($item_params); 495 my $item = $self->__parse_item_RSS ($item_body) || {}; 496 if ( ref $params eq 'HASH' ) { 497 foreach my $key ( %$params ) { 498 if ( exists $item->{$key} && ref $item->{$key} eq 'ARRAY' ) { 499 push @{ $item->{$key} }, $params->{$key}; 500 } elsif ( exists $item->{$key} ) { 501 my @arr = ( $item->{$key}, $params->{$key} ); 502 $item->{$key} = \@arr; 503 } else { 504 $item->{$key} = $params->{$key}; 505 } 506 } 507 } 508 # warn Dumper($item); 509 if ( ref $item eq 'HASH' && scalar keys %$item ) { 510 if ( !exists $item->{pubdate} ) { 511 foreach my $key ( 'prism:publicationdate', 'dc:date' ) { 512 if ( exists $item->{$key} ) { 513 $item->{pubdate} = $item->{$key}; 514 last; 515 } 516 } 517 } 518 push @items, $item; 519 } 520 } 521 } 522 if ( $feed_type eq 'ATOM' ) { 523 while ( $$contref =~ /<entry(.*?)>(.*?)<\/entry([^>]*?)>/sgi ) { 524 my $item_params = $1; 525 my $item_body = $2; 526 my $item = $self->__parse_item_ATOM ($item_body) || {}; 527 # warn Dumper($item); 528 if ( ref $item eq 'HASH' && scalar keys %$item ) { 529 if ( !exists $item->{pubdate} ) { 530 foreach my $key ( 'published', 'updated' ) { 531 if ( exists $item->{$key} ) { 532 $item->{pubdate} = $item->{$key}; 533 last; 534 } 535 } 536 } 537 push @items, $item; 538 } 539 } 540 } 541 return ( scalar @items ? \@items : undef ); 542 } 543 544 545 sub __parse_params { 546 my ($self, $params) = @_; 547 return undef unless $params; 548 549 my %params; 550 while ( $params =~ /([\w\:]+)(\s*?)=(\s*?)["'](.*?)["']/sgi ) { 551 my $name = $1; 552 my $value = $4; 553 if ( $name && $value ) { 554 $params{$name} = $value; 555 } 556 } 557 return ( scalar(keys %params) ? \%params : undef ); 558 } 559 560 561 sub __parse_item_RSS { 562 my ($self, $item_body, $debug) = @_; 563 return undef unless $item_body; 564 565 my %item; 566 # my $embedded = $self->__item_cut_rss_embedded(\$item_body); 567 # if ( ref $embedded ) { 568 # %item = %$embedded; 569 # } 570 # my $content = $self->__item_cut_rss_description(\$item_body); 571 # $item{description} = $content if $content; 572 # my $one_string_elements = $self->__item_cut_single_elements (\$item_body); 573 # if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) { 574 # foreach my $elem ( @$one_string_elements ) { 575 # my ($elem_name) = keys %$elem if ref $elem eq 'HASH'; 576 # if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) { 577 # push @{ $item{$elem_name} }, $elem->{$elem_name}; 578 # } elsif ( exists $item{$elem_name} ) { 579 # $item{$elem_name} = [$item{$elem_name}, $elem->{$elem_name}]; 580 # } else { 581 # $item{$elem_name} = $elem->{$elem_name}; 582 # } 583 # } 584 # } 585 my $parsed = $self->__make_tree (\$item_body, $debug); 586 # warn Dumper($parsed); 587 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) { 588 foreach my $tag ( @{ $parsed->{1}{children} } ) { 589 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) { 590 my %params; 591 foreach my $it ( @{ $tag->{children} } ) { 592 next unless $it->{text}; 593 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) { 594 push @{ $params{$it->{type}} }, $it->{text}; 595 } elsif ( exists $params{$it->{type}} ) { 596 my @arr = ( $params{$it->{type}}, $it->{text} ); 597 $params{$it->{type}} = \@arr; 598 } else { 599 $params{$it->{type}} = $it->{text}; 600 } 601 } 602 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { 603 push @{ $item{$tag->{type}} }, \%params; 604 } elsif ( exists $item{$tag->{type}} ) { 605 my @arr = ( $item{$tag->{type}}, \%params ); 606 $item{$tag->{type}} = \@arr; 607 } else { 608 $item{$tag->{type}} = \%params; 609 } 610 } else { 611 my $body = $tag->{text} || $tag->{params}; 612 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { 613 push @{ $item{$tag->{type}} }, $body; 614 } elsif ( exists $item{$tag->{type}} ) { 615 my @arr = ( $item{$tag->{type}}, $body ); 616 $item{$tag->{type}} = \@arr; 617 } else { 618 $item{$tag->{type}} = $body; 619 } 620 } 621 } 622 } 623 # warn Dumper(\%item); 624 return \%item; 625 } 626 627 628 sub __parse_item_ATOM { 629 my ($self, $item_body, $debug) = @_; 630 return undef unless $item_body; 631 632 my %item; 633 my $embedded = $self->__item_cut_rss_embedded(\$item_body); 634 if ( ref $embedded ) { 635 %item = %$embedded; 636 } 637 if ( exists $item{summary} ) { 638 $item{description} = delete $item{summary}; 639 } else { 640 my $summary = $self->__item_cut_atom_summary(\$item_body); 641 $item{description} = $summary if $summary; 642 } 643 my $content = $self->__item_cut_atom_content(\$item_body); 644 if ( $content && $item{description} ) { 645 $item{fulltext} = $content; 646 } elsif ( $content ) { 647 $item{description} = $content; 648 } 649 my $one_string_elements = $self->__item_cut_single_elements (\$item_body); 650 # warn Dumper ($one_string_elements); 651 if ( ref $one_string_elements eq 'ARRAY' && @$one_string_elements ) { 652 foreach my $elem ( @$one_string_elements ) { 653 my ($elem_name) = keys %$elem if ref $elem eq 'HASH'; 654 if ( exists $item{$elem_name} && ref $item{$elem_name} eq 'ARRAY' ) { 655 push @{$item{$elem_name}}, $elem->{$elem_name}; 656 } elsif ( exists $item{$elem_name} ) { 657 my @arr = ($item{$elem_name}, $elem->{$elem_name}); 658 $item{$elem_name} = \@arr; 659 } else { 660 $item{$elem_name} = $elem->{$elem_name}; 661 } 662 if ( $elem->{$elem_name}{type} =~ /^image/ ) { 663 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} }; 664 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) { 665 push @{ $item{enclosure} }, $enclosure; 666 } elsif ( exists $item{enclosure} ) { 667 my @arr = ($item{enclosure}, $enclosure); 668 $item{enclosure} = \@arr; 669 } else { 670 $item{enclosure} = $enclosure; 671 } 672 } 673 if ( $elem->{$elem_name}{type} =~ /^video/ ) { 674 my $enclosure = { url => $elem->{$elem_name}{href} || $elem->{$elem_name}{url}, type => $elem->{$elem_name}{type} }; 675 if ( exists $item{enclosure} && ref $item{enclosure} eq 'ARRAY' ) { 676 push @{ $item{enclosure} }, $enclosure; 677 } elsif ( exists $item{enclosure} ) { 678 my @arr = ($item{enclosure}, $enclosure); 679 $item{enclosure} = \@arr; 680 } else { 681 $item{enclosure} = $enclosure; 682 } 683 } 684 } 685 } 686 my $parsed = $self->__make_tree (\$item_body, $debug); 687 # warn Dumper($parsed); 688 if ( ref $parsed && exists $parsed->{1} && exists $parsed->{1}{children} && ref $parsed->{1}{children} eq 'ARRAY' ) { 689 foreach my $tag ( @{ $parsed->{1}{children} } ) { 690 if ( ref $tag->{children} eq 'ARRAY' && scalar @{ $tag->{children} } ) { 691 my %params; 692 foreach my $it ( @{ $tag->{children} } ) { 693 next unless $it->{text}; 694 if ( exists $params{$it->{type}} && ref $params{$it->{type}} eq 'ARRAY' ) { 695 push @{ $params{$it->{type}} }, $it->{text}; 696 } elsif ( exists $params{$it->{type}} ) { 697 my @arr = ( $params{$it->{type}}, $it->{text} ); 698 $params{$it->{type}} = \@arr; 699 } else { 700 $params{$it->{type}} = $it->{text}; 701 } 702 } 703 if ( exists $tag->{params} && ref $tag->{params} eq 'HASH' ) { 704 while ( my ($param, $value) = each %{ $tag->{params} } ) { 705 if ( exists $params{$param} && ref $params{$param} eq 'ARRAY' ) { 706 push @{ $params{$param} }, $value; 707 } elsif ( exists $params{$param} ) { 708 my @arr = ( $params{$param}, $value ); 709 $params{$param} = \@arr; 710 } else { 711 $params{$param} = $value; 712 } 713 } 714 } 715 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { 716 push @{ $item{$tag->{type}} }, \%params; 717 } elsif ( exists $item{$tag->{type}} ) { 718 my @arr = ( $item{$tag->{type}}, \%params ); 719 $item{$tag->{type}} = \@arr; 720 } else { 721 $item{$tag->{type}} = \%params; 722 } 723 } else { 724 my $body = $tag->{text} || $tag->{params}; 725 if ( exists $item{$tag->{type}} && ref $item{$tag->{type}} eq 'ARRAY' ) { 726 push @{ $item{$tag->{type}} }, $body; 727 } elsif ( exists $item{$tag->{type}} ) { 728 my @arr = ( $item{$tag->{type}}, $body ); 729 $item{$tag->{type}} = \@arr; 730 } else { 731 $item{$tag->{type}} = $body; 732 } 733 } 734 } 735 my $pubDate = exists $item{issued} ? $item{issued} : exists $item{modified} ? $item{modified} : undef; 736 $item{pubdate} = $pubDate if $pubDate; 737 } 738 739 # warn Dumper(\%item); 740 return \%item; 741 } 742 743 744 sub __make_tree { 745 my ($self, $content, $debug) = @_; 746 747 my @elems = split (//,$$content); 748 # warn "CONTENT: [$$content]\n\n"; 749 my $id = 1; 750 my $level = 0; 751 my @stack; 752 my %tree = ( 753 root => { 754 id => $id++, 755 text => '', 756 type => 'root', 757 children=> [], 758 parent => undef, 759 level => $level, 760 }, 761 ); 762 my %elem_hash = ( 1 => $tree{root} ); 763 my $current = $tree{root}; 764 765 while ( @elems ) { 766 if ( $elems[0] eq '<' && $elems[1] =~ /[\!a-zA-Z]/ ) { 767 my $tag = $self->__try_tag (\@elems); 768 if ( ref $tag && $tag->{type} eq 'text' ) { 769 $current->{text} .= $tag->{content}; 770 splice @elems, 0, $tag->{count}; 771 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; 772 } elsif ( ref $tag && exists $tag->{closed} && $tag->{closed} ) { 773 $tag->{id} = $id++; 774 $tag->{parent} = $current; 775 $tag->{level} = $level+1; 776 $elem_hash{$tag->{id}} = $tag; 777 push @{$current->{children}}, $tag; 778 splice @elems, 0, $tag->{count}; 779 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; 780 } elsif ( ref $tag ) { 781 $tag->{id} = $id++; 782 $tag->{children} = []; 783 $tag->{parent} = $current; 784 $tag->{level} = ++$level; 785 $elem_hash{$tag->{id}} = $tag; 786 push @{$current->{children}}, $tag; 787 push @stack, $current; 788 $current = $tag; 789 splice @elems, 0, $tag->{count}; 790 # warn "Tag: [".$current->{type}."]\n Text added:[".$tag->{content}."]\n"; 791 } else { 792 # warn "!!!! Error: RSS analyse. Job on item broken... !!!!\n" if $debug; 793 return undef; 794 } 795 } elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) { 796 my $tag = $self->__try_end (\@elems); 797 if ( ref $tag && $tag->{type} eq 'text' ) { 798 $current->{text} .= $tag->{content}; 799 $current->{count} += $tag->{count}; 800 splice @elems, 0, $tag->{count}; 801 } elsif ( ref $tag ) { 802 if ( $current->{type} ne $tag->{type} ) { 803 # warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug; 804 return undef; 805 } else { 806 $current = pop @stack; 807 $level = $current->{level}; 808 # warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text}; 809 # warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n"; 810 } 811 splice @elems, 0, $tag->{count}; 812 } else { 813 # warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug; 814 return undef; 815 } 816 } else { 817 $current->{text} .= shift @elems; 818 $current->{count}++; 819 } 820 } 821 return (\%elem_hash); 822 } 823 824 825 826 sub __try_tag { 827 my ($self, $content) = @_; 828 829 my $i = 1; 830 my %tag; 831 my $tag = $content->[0]; 832 if ( $content->[$i] eq '!' ) { 833 # warn "What? Think it's CDATA\n"; 834 my $try_cdata = join '', @$content[1..8]; 835 if ( $try_cdata eq '![CDATA[' ) { 836 $tag = ''; 837 $i = 9; 838 while ( !($content->[$i-1] eq '>' && $content->[$i-2] eq ']' && $content->[$i-3] eq ']') && $i < scalar @$content ) { 839 $tag .= $content->[$i]; 840 $i++; 841 } 842 chop $tag; chop $tag; chop $tag; 843 } 844 # warn "CDATA Found: [$tag]"; 845 return { 846 type => 'text', 847 content => $tag, 848 count => $i, 849 }; 850 } 851 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { 852 $tag .= $content->[$i]; 853 $i++; 854 } 855 if ( $content->[$i] eq '<' || $i >= scalar @$content ) { 856 return { 857 type => 'text', 858 content => $tag, 859 count => $i, 860 }; 861 } else { 862 if ( $tag =~ /^<([\w:-]+)\s*(.*)/si ) { 863 my $elem_name = $1; 864 my $elem_body = $2; 865 unless ( $self->__is_valid_tag ($elem_name) ) { 866 return { 867 type => 'text', 868 content => $tag, 869 count => $i, 870 }; 871 } else { 872 my $params = $self->__parse_params ($elem_body) if $elem_body; 873 if ( $content->[$i] eq '>' && $content->[$i-1] eq '/' ) { 874 $tag{closed} = 1; 875 } 876 $tag{type} = lc($elem_name); 877 $tag{count} = $i+1; 878 $tag{params} = $params if ref $params; 879 return \%tag; 880 } 881 } else { 882 return { 883 type => 'text', 884 content => $tag, 885 count => $i, 886 }; 887 } 888 } 889 } 890 891 892 sub __try_end { 893 my ($self, $content) = @_; 894 895 my $i = 2; 896 my %tag; 897 my $tag = $content->[0].$content->[1]; 898 while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) { 899 $tag .= $content->[$i]; 900 $i++; 901 } 902 if ( $content->[$i] eq '<' || $i >= scalar @$content ) { 903 return { 904 type => 'text', 905 content => $tag, 906 count => $i, 907 }; 908 } else { 909 if ( $tag =~ /^<\/([\w:-]+)/i ) { 910 my $elem_name = $1; 911 unless ( $self->__is_valid_tag ($elem_name) ) { 912 return { 913 type => 'text', 914 content => $tag, 915 count => $i, 916 }; 917 } else { 918 $tag{type} = lc($elem_name); 919 $tag{count} = $i+1; 920 return \%tag; 921 } 922 } else { 923 return { 924 type => 'text', 925 content => $tag, 926 count => $i, 927 }; 928 } 929 } 930 } 931 932 933 sub __is_valid_tag { 934 my ($self, $tag) = @_; 935 foreach my $invtag ( @INVALID_TAGS ) { 936 return 0 if lc($invtag) eq lc($tag); 937 } 938 return 1; 939 } 940 941 942 sub __item_cut_atom_content { 943 my ($self, $item_body) = @_; 944 945 my %elem; 946 if ( $$item_body =~ /<content([^>]*?)>(.*?)<\/content([^>]*)>/si ) { 947 my $content_params = $1; 948 my $content_body = $2; 949 my $params = $self->__parse_params ($content_params) if $content_params; 950 $$item_body =~ s/<content([^>]*?)>(.*?)<\/content([^>]*)>//si; 951 return $content_body; 952 } 953 } 954 955 956 sub __item_cut_atom_summary { 957 my ($self, $item_body) = @_; 958 959 my %elem; 960 if ( $$item_body =~ /<summary([^>]*)>(.*?)<\/summary([^>]*)>/si ) { 961 my $content_params = $1; 962 my $content_body = $2; 963 my $params = $self->__parse_params ($content_params) if $content_params; 964 $$item_body =~ s/<summary([^>]*)>(.*?)<\/summary([^>]*)>//si; 965 return $content_body; 966 } 967 } 968 969 970 sub __item_cut_rss_description { 971 my ($self, $item_body) = @_; 972 973 my %elem; 974 if ( $$item_body =~ /<description([^>]*?)>(.*?)<\/description([^>]*)>/si ) { 975 my $content_params = $1; 976 my $content_body = $2; 977 my $params = $self->__parse_params ($content_params) if $content_params; 978 $$item_body =~ s/<description([^>]*?)>(.*?)<\/description([^>]*)>//si; 979 return $content_body; 980 } 981 } 982 983 984 sub __item_cut_rss_embedded { 985 my ($self, $item_body) = @_; 986 987 my %elem; 988 while ( $$item_body =~ /<([^>]*?)>\s*<!\[CDATA\[(.*?)\]\]>\s*<\/([^>]*)>/sgi ) { 989 my $tag = $3; 990 my $content_body = $2; 991 my $content_params = $1; 992 if ( $content_params =~ /([\w:-]+)\s+(.*)/ ) { 993 $tag = 1; 994 $content_params = $2; 995 } 996 my $params = $self->__parse_params ($content_params) if $content_params; 997 $elem{$tag} = $content_body; 998 $$item_body =~ s/<$tag([^>]*?)>(.*?)<\/$tag([^>]*)>//si; 999 } 1000 return scalar keys %elem ? \%elem : undef; 1001 } 1002 1003 1004 1005 sub __item_cut_single_elements { 1006 my ($self, $item_body) = @_; 1007 1008 my @elems; 1009 while ( $$item_body =~ /<([\w\:\-]+)\s*([^>]*?)\/>/sgi ) { 1010 my $elem_name = $1; 1011 my $elem_body = $2; 1012 my $params = $self->__parse_params ($elem_body) if $elem_body; 1013 if ( $elem_name && ref $params ) { 1014 push @elems, { $elem_name => $params } 1015 } 1016 } 1017 $$item_body =~ s/<(\w+)\s*([^>]*?)\/>//sgi; 1018 return ( @elems ? \@elems : undef ); 1019 } 1020 1021 1022 sub __field_prepare { 1023 my ($self, $text) = @_; 1024 1025 # $text =~ s/^[\n\r\x20\t]+//; 1026 $self->__cdata (\$text); 1027 $self->__extchar (\$text); 1028 $text = HTML::Entities::decode_entities($text); 1029 1030 # Remove linebreaks inside incorrectly breaked paragraphs 1031 if (length($text) > 100) { 1032 my $pcount = 0; 1033 while ($text =~ /<p>(.+?)(?=<\/?p>|$)/sgi) { 1034 my $p = $1; 1035 if (length $p > 50) { 1036 my ($dcount, $ndcount) = (); 1037 # Count sentences normally ended vs breaked 1038 $dcount++ while $p =~ /(\.|\?|\!)['"]?\s*[\r\n]+/g; 1039 $ndcount++ while $p =~ /([^\.\?\!\s])\s*[\r\n]+/g; 1040 # Found broken paragraph 1041 last if $ndcount > $dcount and ++$pcount > 1; 1042 } 1043 } 1044 if ($pcount > 0) { 1045 $text =~ s/[\n\r]+/ /sg; 1046 } 1047 } 1048 $text =~ s/<br[^>]*>/\n/sgi; 1049 $text =~ s/<p\s*>/\n\n/sgi; 1050 $text =~ s/<\/p\s*>//sgi; 1051 $text = rchannel::Parser::Util::strip_html($text); 1052 $text = rchannel::Parser::Util::text_cleanup($text); 1053 return $text; 1054 } 1055 1056 1057 sub __extchar { 1058 my ($self, $textref) = @_; 1059 1060 for ( $$textref ) { 1061 s/&/\&/sg; 1062 s/\&/\&/sgi; 1063 s/«/«/sg; 1064 s/»/»/sg; 1065 s/£/£/sg; 1066 s/–/–/sg; 1067 s/—/—/sg; 1068 s/„/"/sg; 1069 s/“/"/sg; 1070 s/”/"/sg; 1071 s/´/'/sg; 1072 s/…/\.\.\./sg; 1073 s/ /\n/sg; 1074 s/"/"/sg; 1075 s/\xA0/\x20/sg; 1076 } 1077 # $$textref =~ s/&#(\d+);/{'&#'.__normalise($1).';'}/eg; 1078 # $$textref =~ s/>/>/sgi; 1079 # $$textref =~ s/</</sgi; 1080 # $$textref =~ s/"/"/sgi; 1081 # $$textref =~ s/«/«/sgi; 1082 # $$textref =~ s/»/»/sgi; 1083 # $$textref =~ s/©/©/sgi; 1084 # $$textref =~ s/–/–/sgi; 1085 # $$textref =~ s/—/—/sgi; 1086 # $$textref =~ s/°/º/sgi; 1087 # $$textref =~ s/ /\x20/sgi; 1088 } 1089 1090 sub __normalise { 1091 my $chr = shift; 1092 return sprintf("%04d",$chr); 1093 } 1094 1095 sub __cdata { 1096 my ($self, $textref) = @_; 1097 if ( $$textref =~ /^<\!\[CDATA\[/ ) { 1098 $$textref =~ s/<\!\[CDATA\[//sgi; 1099 $$textref =~ s/\]\]>//sgi; 1100 } 1101 } 1102 1103 1104 sub __parse_rools { 1105 my ($self, $rools) = @_; 1106 return unless $rools; 1107 $rools =~ s/\r//sgi; 1108 my @rools = split /\n/, $rools; 1109 return unless @rools; 1110 1111 my @parsed; 1112 foreach my $rool ( @rools ) { 1113 my %pr; 1114 next if $rool =~ /^#/; 1115 $rool =~ s/[\x20\t]+$//; 1116 $rool =~ s/^[\x20\t]+//; 1117 if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) { 1118 $pr{action} = lc($1); 1119 my $params = $2; 1120 if ( $pr{action} eq 'use' && $params =~ /^(current)\s+(date)$/ ) { 1121 $pr{action} = 'rewrite'; 1122 $pr{target} = 'pubdate'; 1123 $pr{command} = 'set'; 1124 $pr{condition} = 'CURRENT_DATETIME'; 1125 push @parsed, \%pr; 1126 } elsif ( $params =~ /^(\w+)\s+(.*)$/ ) { 1127 $pr{target} = lc($1); 1128 $params = $2; 1129 if ( $params =~ /^([\w=]+)\s+(.*)$/ ) { 1130 $pr{command} = lc($1); 1131 $params = $2; 1132 if ( $pr{action} eq 'filter' && $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) { 1133 $pr{subcommand} = lc($1); $params = $2; 1134 next unless $pr{subcommand} =~ /^(untill|till|from|off|regex)$/; 1135 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; 1136 $pr{condition} = $params; 1137 } elsif ( $pr{action} eq 'filter' && $pr{command} eq 'regex' && substr($params,0,1) eq substr($params,-1,1) && substr($params,0,1) =~ /([\/\#\|])/ ) { 1138 my $delim = $1; 1139 $params = substr($params,1,length($params)-2); 1140 my @params = split(//,$params); 1141 my ($from, $to) = ('',''); 1142 my $prev = ''; 1143 while ( @params ) { 1144 my $ch = shift @params; 1145 if ( $ch eq $delim && $prev ne '\\' ) { 1146 last; 1147 } else { 1148 $prev = $ch; 1149 $from .= $ch; 1150 } 1151 } 1152 $to = join ('', @params); 1153 $pr{condition} = { from => $from, to => $to }; 1154 } elsif ( ($pr{action} eq 'ignore' || $pr{action} eq 'only') && $pr{command} =~ /^(regex|=|contain|contains)$/ ) { 1155 $params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex'; 1156 $pr{condition} = $params; 1157 } else { 1158 next; 1159 } 1160 push @parsed, \%pr; 1161 } 1162 } 1163 } 1164 } 1165 return ( scalar @parsed ? \@parsed : undef ); 1166 } 1167 1168 1169 sub __parse_related { 1170 my ($self, $related) = @_; 1171 return unless ref $related eq 'HASH'; 1172 return unless exists $related->{url} && $related->{url} =~ /^http:\/\//i; 1173 return unless exists $related->{rel} && $related->{rel} =~ /(news|discussion|teaser)/; 1174 my $result = { url => $related->{url}, rel => $related->{rel} }; 1175 $result->{type} = $related->{type} if exists $related->{type}; 1176 $result->{title} = $self->__field_prepare($related->{title}) if exists $related->{title} && $related->{title}; 1177 1178 $result->{author} = $self->__field_prepare($related->{author}) if exists $related->{author} && $related->{author}; 1179 $result->{description} = $self->__field_prepare($related->{description}) if exists $related->{description} && $related->{description}; 1180 1181 if ( exists $related->{pubdate} && $related->{pubdate} ) { 1182 my $pubdate = Class::Date::localdate(Date::Parse::str2time($related->{pubdate})); 1183 $result->{pubdate} = $pubdate if $pubdate; 1184 } 1185 if ( $related->{rel} =~ /(news|teaser)/ ) { 1186 return undef unless $result->{title} && $result->{pubdate}; 1187 } else { 1188 $result->{title} ||= 'Обсудить'; 1189 } 1190 1191 if ( exists $related->{image} && $related->{image} ) { 1192 if ( ref $related->{image} eq 'HASH' && (exists $related->{image}{url} || exists $related->{image}{href}) ) { 1193 my $img = rchannel::Image->new( { src => ($related->{image}{url} || $related->{image}{href}) } ); 1194 $result->{image} = $img if ref $img; 1195 } elsif ( !ref $related->{image} ) { 1196 my $img = rchannel::Image->new( { src => $related->{image} } ); 1197 $result->{image} = $img if ref $img; 1198 } 1199 } 1200 1201 return $result; 1202 } 1203 1204 1205 # TODO IMAGES: 1206 # enclosure 1207 # media:content 1208 # media:thumbnail 1209 # image 1210 # img 1211 1212 # FOUDNED: 1213 # author 1214 # category 1215 # comments 1216 # content 1217 # content:encoded 1218 # content:format 1219 # dc:creator 1220 # dc:date 1221 # dc:rights 1222 # dc:subject 1223 # description 1224 # enclosure 1225 # feedburner:awareness 1226 # feedburner:origLink 1227 # full-text 1228 # fulltext 1229 # guid 1230 # guide 1231 # habrahabr:ballsCount 1232 # habrahabr:commentsCount 1233 # id 1234 # image 1235 # img 1236 # link 1237 # media:content 1238 # media:thumbnail 1239 # pdalink 1240 # pubDate 1241 # pubdate 1242 # pubid 1243 # published 1244 # rambler:full-text 1245 # rambler:fulltext 1246 # region 1247 # section 1248 # sections 1249 # source 1250 # sport 1251 # summary 1252 # text 1253 # title 1254 # updated 1255 # wfw:commentRSS 1256 # wfw:commentRss 1257 # wmj:fulltext 1258 # yandex:full-text 1259 1260 1; -
utf8/core/lib/Contenido/Parser/Util.pm
1 package Contenido::Parser::Util; 2 3 use strict; 4 5 sub clean_invalid_chars { # http://www.w3.org/TR/REC-xml/#NT-Char 6 my ($cont_ref) = shift; 7 $$cont_ref =~ s/[\x0-\x8|\xB\xC|\xE-\x1F|\x{d800}-\x{dfff}|\x{fffe}\x{ffff}]//sgi; 8 } 9 10 sub text_cleanup { 11 my $text = shift; 12 my $delim = shift || "\n\n"; 13 14 $text =~ s/^\s+//; $text =~ s/\s+$//; 15 $text =~ s/\r\n/\n/g; 16 17 my @paragfs = $text =~ /\n{2,}/ ? # is paragraphs detected? 18 split /\n{2,}/, $text : # - by paragraphs only 19 split /\n+/, $text; # - by any newline 20 21 for (@paragfs) { 22 s/^\s+//mg; s/\s+$//mg; # trim whitespace 23 s/[[:blank:]]+/ /g; # collapse spaces 24 } 25 26 return join "\n\n", grep length $_, @paragfs; 27 } 28 29 1; -
utf8/core/lib/Utils/HTML.pm
269 269 $t1 = $t2 = join ' ', @words[0 .. $args{max_words}-1]; 270 270 271 271 # magic ! 272 my @wds = split ' ', $t1; 273 return $t1 if $t1 =~ s/^(.+[\w»")]{3,}[.!?])+\s?[А-ЯA-Z«"].+?$/$1/ and scalar(@wds) > $args{min_words}; 272 s/^(.+\w{3,}[»")]?[.!?]+)\s*[А-ЯA-Z«"].+?$/$1/s and (()=/(\s+)/g)>$args{min_words} and return$_ for $t1; 274 273 275 274 $t2 =~ s/[.,:;!?\s—-]+$//; 276 275 $t2.($args{ending} || '');