Revision 99
Date:
2011/05/11 18:09:30
Author:
ahitrov
Revision Log:
Парсер
Files:
Legend:
Added
Removed
Modified
utf8/core/lib/Contenido/Parser.pm
1
package Contenido::Parser;
2
3
use strict;
4
use warnings;
5
use locale;
6
7
use Encode;
8
use URI;
9
use Data::Dumper;
10
use Contenido::Globals;
11
use LWP::UserAgent;
12
use Contenido::File::Scheme::FILE;
13
use Contenido::Parser::Util;
14
15
sub fetch {
16
my ($self, $input, %opts) = @_;
17
18
my ($fh, $content);
19
my $encoding = delete $opts{encoding};
20
if (not ref $input) {
21
no strict "refs";
22
my $scheme = uc(scheme($input));
23
if ( $scheme eq 'FILE' ) {
24
$fh = &{"Contenido::File::Scheme::".uc(scheme($input))."::get_fh"}($input);
25
} else {
26
my $request = new HTTP::Request GET => $input;
27
my $ua = new LWP::UserAgent;
28
$ua->timeout(10);
29
my $res = $ua->request($request);
30
if ($res->is_success) {
31
$self->{headers} = $res->headers;
32
my $content_length = $res->headers->header('content-length');
33
my $content_type = $res->headers->header('content-type');
34
$self->{content_type} = $content_type;
35
if ( $content_type =~ /charset\s*=\s*([a-z\d\-]+)/i ) {
36
$encoding = $1;
37
}
38
my $base_url = $input =~ /^([a-z]+:\/\/[a-z\.\d]+)/ ? $1 : '';
39
$self->{base_url} = $base_url if $base_url;
40
$content = $res->content;
41
} else {
42
warn $res->status_line." \n";
43
$self->{success} = 0;
44
$self->{reason} = $res->status_line;
45
return $self;
46
}
47
}
48
} elsif ((ref $input eq "GLOB") or (ref $input eq 'Apache::Upload') or (ref $input eq 'IO::File')) {
49
$fh = $input;
50
} elsif (ref $input eq "SCALAR") {
51
$fh = IO::Scalar->new($input);
52
} else {
53
warn("Path, scalar ref or fh needed");
54
$self->{success} = 0;
55
$self->{reason} = 'Path, scalar ref or fh needed';
56
return $self;
57
}
58
59
if ( ref $fh ) {
60
$content = <$fh>;
61
}
62
if ( $content ) {
63
warn Dumper($self);
64
unless ( $encoding ) {
65
$encoding = $self->__try_content_encoding( substr($content, 0, 350) );
66
$self->{encoding} = $encoding;
67
if ( $encoding && $encoding ne 'utf-8' ) {
68
Encode::from_to($content, $encoding, 'utf-8');
69
}
70
}
71
$self->{content} = $content;
72
$self->{success} = 1;
73
} else {
74
$self->{success} = 0;
75
$self->{reason} = 'Content is empty';
76
}
77
return $self;
78
}
79
80
sub is_success {
81
my ($self, $val) = @_;
82
83
if ( defined $val ) {
84
$self->{success} = $val;
85
return $self;
86
} else {
87
return $self->{success};
88
}
89
}
90
91
sub __try_content_encoding {
92
my ($self, $input)= @_;
93
if ( $input =~ /encoding[\ ]?=[\ ]?[\"\']?([a-z\-\d]+)/i ) {
94
return lc($1);
95
} elsif ( $input =~ /(utf-8|windows-1251|koi8-r)/i ) {
96
return lc($1);
97
} else {
98
return undef;
99
}
100
}
101
102
sub scheme {
103
my $uri = shift;
104
my $scheme;
105
106
$scheme = URI->new($uri)->scheme() || "file";
107
108
return $scheme;
109
}
110
111
112
1;
utf8/core/lib/Contenido/Parser/HTML.pm
1
package Contenido::Parser::HTML;
2
3
use strict;
4
use warnings;
5
use locale;
6
7
use base 'Contenido::Parser';
8
9
use Contenido::Globals;
10
use Utils::HTML;
11
use Data::Dumper;
12
use utf8;
13
14
15
my @PICNAME = qw ( top menu topmenu home line dot mail razdel button find search srch delivery
16
head bar label phone bottom bottommenu ico icon post left right service caption arr arrow cart
17
basket main reply title corner address page buy pix pixel spacer fon welcome razd about back
18
shapka phones print tel phpBB uho korz korzina raspisanie shop login blank telephone telephones
19
dealer diler background bg news rss index none btn cards up footer noimage but link excel price
20
mid graphic busket map girl space catalog bann headline hosting contact schedule redir email
21
);
22
23
my @PICHOST = qw ( top.list.ru addweb.ru adland.ru extreme-dm.com top100.rambler.ru
24
mypagerank.ru informer.gismeteo.ru lux-bn.com.ua link-txt.com myrating.ljseek.com c.bigmir.net
25
);
26
27
my @PICURL = qw ( rorer counter count ljplus yadro spylog hotlog banner baner ban banners ban
28
icq mirabilis adriver advertising ad adv ads adview advert weather imho awaps reklama stat cnt
29
ipz design icons promo cycounter captcha foto_hit header random adcycle rssfeed bansrc
30
);
31
32
33
my @bad_dimensions = (
34
{ w => 120, h => 60 },
35
{ w => 468, h => 60 },
36
{ w => 120, h => 600 },
37
{ w => 88, h => 31 },
38
);
39
40
41
sub new {
42
my ($proto) = @_;
43
my $class = ref($proto) || $proto;
44
my $self = {};
45
bless $self, $class;
46
47
return $self;
48
}
49
50
51
sub parse {
52
my ($self, %opts) = @_;
53
54
my $content;
55
if ( $opts{content} ) {
56
$content = delete $opts{content};
57
delete $self->{content};
58
} elsif ( $self->{success} || $self->{content} ) {
59
$content = delete $self->{content};
60
} else {
61
$self->{success} = 0;
62
return $self;
63
}
64
65
my $base_url = delete $self->{base_url} || delete $opts{base_url};
66
my $strip_html = delete $opts{strip_html};
67
my $debug = $DEBUG;
68
my $gui = delete $opts{gui};
69
my $header = delete $opts{header};
70
my $description = delete $opts{description};
71
my $minimum = delete $opts{min} || length $description;
72
73
my $pre_rools = $self->__parse_rools (delete $opts{parser_pre});
74
warn Dumper ($pre_rools) if $debug;
75
my $parse_rools = $self->__parse_rools (delete $opts{parser_run});
76
warn Dumper ($parse_rools) if $debug;
77
my $post_rools = $self->__parse_rools (delete $opts{parser_end});
78
warn Dumper ($post_rools) if $debug;
79
80
# warn "Experimental. Debug!!!\n" if $debug;
81
if ( ref $pre_rools eq 'ARRAY' ) {
82
my @sets = grep { $_->{command} eq 'set' } @$pre_rools;
83
foreach my $set ( @sets ) {
84
if ( $set->{condition}{param} eq 'min' || $set->{condition}{param} eq 'minimum' ) {
85
my $value = $set->{condition}{value};
86
unless ( $value =~ /\D/ ) {
87
if ( $set->{subcommand} eq 'limit' ) {
88
$minimum = $minimum && $minimum > int($value) ? int($value) : $minimum ? $minimum : int($value);
89
} else {
90
$minimum = int($value);
91
}
92
}
93
}
94
if ( $set->{condition}{param} eq 'description' && $set->{condition}{value} eq 'header' ) {
95
$description = $header;
96
}
97
}
98
}
99
$minimum ||= 300;
100
101
warn "Tag cleaning...\n" if $debug;
102
$self->__clean_tags (\$content, $pre_rools);
103
$content =~ s/>\s+</></sgi;
104
warn "Image cleaning...\n" if $debug;
105
$self->__clean_img (\$content);
106
warn "Empty div cleaning...\n" if $debug;
107
while ( $self->__clean_empty_div (\$content) ) {}
108
warn "Make tree...\n" if $debug;
109
my ($tree, $shortcuts) = $self->__make_tree (\$content, $parse_rools, $debug);
110
111
$self->__extract_img ($shortcuts, $base_url, $debug);
112
$self->__extract_headers ($shortcuts, $header, $debug);
113
warn "Getting big texts (min=$minimum)...\n" if $debug;
114
my $chosen = $self->__dig_big_texts (
115
structure => $shortcuts,
116
min => $minimum,
117
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
118
debug => $debug );
119
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
120
$self->{error_message} = 'Nothing was found at all!!! Check your MINIMUM value';
121
return $self->is_success(0) unless $gui;
122
}
123
@$chosen = sort { $a->{id} <=> $b->{id} } @$chosen;
124
if ( $description ) {
125
my @use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$parse_rools if ref $parse_rools eq 'ARRAY';
126
$chosen = $self->__check_description ($chosen, $description, $debug) unless @use_rools;
127
}
128
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
129
$self->{error_message} = 'I didn\'t find any valuable text';
130
return $self->is_success(0) unless $gui;
131
}
132
if ( scalar @$chosen > 1 ) {
133
$chosen = $self->__check_headers ($chosen, $header, $debug);
134
}
135
unless ( ref $chosen eq 'ARRAY' && @$chosen ) {
136
$self->{error_message} = 'I didn\'t find any valuable text';
137
return $self->is_success(0) unless $gui;
138
}
139
$self->__strip_html (
140
chosen => $chosen,
141
header => $header,
142
ref $post_rools eq 'ARRAY' && @$post_rools ? (rools => $post_rools) : (),
143
debug => $debug
144
);
145
if ( ref $parse_rools eq 'ARRAY' ) {
146
my ($glue) = grep { $_->{command} eq 'glue' } @$parse_rools;
147
$self->__glue ( $chosen, $glue, $debug ) if ref $glue;
148
}
149
my $images = $self->__get_images (
150
structure => $shortcuts,
151
chosen => $chosen->[0],
152
base_url => $base_url,
153
ref $parse_rools eq 'ARRAY' && @$parse_rools ? (rools => $parse_rools) : (),
154
debug => $debug,
155
);
156
if ( ref $images eq 'ARRAY' && @$images ) {
157
$self->images($images);
158
$self->image($images->[0]);
159
}
160
161
if ( $gui ) {
162
if ( ref $chosen eq 'ARRAY' ) {
163
foreach my $elem ( @$chosen ) {
164
$self->__post_rool ($elem, $post_rools, $description);
165
}
166
}
167
$self->{text} = ref $chosen eq 'ARRAY' ? $chosen->[0] : $chosen;
168
$self->{html} = $content;
169
$self->{tree} = $shortcuts;
170
$self->{chosen} = $chosen;
171
} else {
172
$self->__post_rool ($chosen->[0], $post_rools, $description);
173
$self->{text} = Contenido::Parser::Util::text_cleanup($chosen->[0]->{text});
174
$tree = undef;
175
foreach my $key ( keys %$shortcuts ) {
176
delete $shortcuts->{$key};
177
}
178
$shortcuts = undef;
179
$content = undef;
180
}
181
return $self->is_success(1);
182
}
183
184
sub __clean_tags {
185
my ($self, $content, $rools) = @_;
186
187
my @cut_rools;
188
if ( ref $rools eq 'ARRAY' && @$rools) {
189
@cut_rools = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'cut' } @$rools;
190
}
191
my @clean_off_rools;
192
if ( ref $rools eq 'ARRAY' && @$rools) {
193
@clean_off_rools = grep { $_->{command} eq 'clean' && $_->{subcommand} eq 'off' } @$rools;
194
}
195
$$content =~ s/<!DOCTYPE(.*?)>//sgi;
196
$$content =~ s/<!--(.*?)-->//sgi;
197
$$content =~ s/<script(.*?)<\/script>//sgi;
198
$$content =~ s/<hr(.*?)>//sgi;
199
$$content =~ s/<noscript(.*?)<\/noscript>//sgi;
200
$$content =~ s/<iframe(.*?)<\/iframe>//sgi;
201
unless ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'noindex' } @cut_rools ) {
202
$$content =~ s/<noindex(.*?)<\/noindex>//sgi;
203
} else {
204
$$content =~ s/<\/?noindex(.*?)>//sgi;
205
}
206
$$content =~ s/<object(.*?)<\/object>//sgi;
207
$$content =~ s/<embed(.*?)<\/embed>//sgi;
208
$$content =~ s/<style(.*?)<\/style>//sgi;
209
if ( grep { $_->{condition}{param} eq 'tag' && $_->{condition}{value} eq 'form' } @cut_rools ) {
210
$$content =~ s/<select(.*?)<\/select([^>]*?)>//sgi;
211
$$content =~ s/<textarea(.*?)<\/textarea([^>]*?)>//sgi;
212
$$content =~ s/<input([^>]*?)>//sgi;
213
} else {
214
$$content =~ s/<form(.*?)<\/form>//sgi;
215
}
216
foreach my $rool ( @clean_off_rools ) {
217
next unless $rool->{condition}{param} eq 'tag';
218
my $tag = $rool->{condition}{value};
219
$$content =~ s/<$tag(.*?)<\/$tag>//sgi;
220
}
221
$$content =~ s/<head(.*?)<\/head>//sgi;
222
$$content =~ s/\ style="(.*?)"//sgi;
223
224
$$content =~ s/<\/?span(.*?)>//sgi;
225
$$content =~ s/<\/?font(.*?)>//sgi;
226
$$content =~ s/<br(.*?)>/\n/sgi;
227
$$content =~ s/<link(.*?)>//sgi;
228
$$content =~ s/<spacer(.*?)>//sgi;
229
$$content =~ s/<\!\?(.*?)\?>//sgi;
230
# $$content =~ s/<a\s*?(.*?)>/\n/sgi;
231
$$content =~ s/<\/p\s*>//sgi;
232
# $$content =~ s/<\/a\s*>//sgi;
233
$$content =~ s/<p\s*(.*?)>/\n\n/sgi;
234
$$content =~ s/onclick="(.*?)"//sgi;
235
$$content =~ s/onload="(.*?)"//sgi;
236
237
}
238
239
sub __clean_img {
240
my ($self, $content) = @_;
241
242
my @garbage;
243
my $i = 1;
244
245
while ( $$content =~ /(<img.*?>)/sgi ) {
246
my $img = $1;
247
my $src;
248
if ( $img =~ /src=([^\x20|^>]+)/i ) {
249
$src = $1;
250
}
251
my ($w, $h);
252
if ( $img =~ /width\s*=\s*["'](\d+)/i || $img =~ /width\s*=\s*(\d+)/i ) {
253
$w = $1;
254
}
255
if ( $img =~ /height\s*=\s*["'](\d+)/i || $img =~ /height\s*=\s*(\d+)/i ) {
256
$h = $1;
257
}
258
my $delim = 0;
259
if ( $w && $h ) {
260
foreach my $pair ( @bad_dimensions ) {
261
if ($w == $pair->{w} && $h == $pair->{h}) {
262
$delim = 10;
263
last;
264
}
265
}
266
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
267
}
268
my $bad_name = __check_img_name ( $src );
269
if ( $bad_name || ($w && $w < 80) || ($h && $h < 80) || ( $w && $h && ($delim > 2.5) ) ) {
270
# warn "Bad name: [$src]\n";
271
push @garbage, $src;
272
}
273
}
274
275
foreach my $src (@garbage) {
276
$src =~ s|([*+?()/\\\$\[\]])|\\$1|sg;
277
$$content =~ s/<img([^>]*?)src=$src([^>]+)>//si;
278
}
279
}
280
281
282
sub __check_img_name {
283
my $name = shift;
284
my $test = $1 if $name =~ /\/([^\/]+)$/;
285
if ( $test =~ /\d+[x-]\d+/ || $test =~ /\.gif$/i ) {
286
return 1;
287
}
288
foreach my $word ( @PICNAME ) {
289
if ( $test =~ /^$word/si || $test =~ /[^a-z]$word[^a-z]/si ) {
290
return 1;
291
}
292
}
293
foreach my $word ( @PICURL ) {
294
if ( $name =~ /^$word/si || $name =~ /[^a-z]$word[^a-z]/si ) {
295
return 1;
296
}
297
}
298
foreach my $word ( @PICHOST ) {
299
if ( index (lc($name), $word) >= 0 ) {
300
return 1;
301
}
302
}
303
return 0;
304
}
305
306
307
sub __clean_empty_div {
308
my ($self, $content) = @_;
309
310
my $i = 0;
311
while ( $$content =~ s/(<div[^>]*?><\/div\s*>)//sgi ) {
312
$i++;
313
}
314
315
return $i;
316
}
317
318
319
sub __make_tree {
320
my ($self, $content, $rools, $debug) = @_;
321
322
my @elems = split (//,$$content);
323
my @collaborate;
324
if ( ref $rools eq 'ARRAY' && @$rools) {
325
@collaborate = grep { $_->{command} eq 'collaborate' } @$rools;
326
}
327
my %hierarchy = ( div => 0, td => 1, tr => 2, table => 3, body => 4, html => 5 );
328
my $id = 1;
329
my $level = 0;
330
my %tree = (
331
root => {
332
id => $id++,
333
text => '',
334
type => 'root',
335
children=> [],
336
parent => undef,
337
level => $level,
338
},
339
);
340
my @stack;
341
my %elem_hash = ( 1 => $tree{root} );
342
my $current = $tree{root};
343
my $previous;
344
345
while ( @elems ) {
346
if ($elems[0] =~ /[\ \t]/ && !$current){
347
shift @elems;
348
next;
349
}
350
if ( $elems[0] eq '<' && $elems[1] =~ /[a-zA-Z]/ ) {
351
my $tag = $self->__try_tag (\@elems);
352
if ( ref $tag && $tag->{type} eq 'text' ) {
353
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
354
if ( ref $last_text_tag ) {
355
$last_text_tag->{text} .= $tag->{content};
356
$last_text_tag->{count} += $tag->{count};
357
} else {
358
$last_text_tag = $tag;
359
$last_text_tag->{id} = $id++;
360
$last_text_tag->{type} = 'text';
361
$last_text_tag->{parent} = $current;
362
$last_text_tag->{level} = $level+1;
363
$elem_hash{$last_text_tag->{id}} = $last_text_tag;
364
push @{$current->{children}}, $last_text_tag;
365
$current->{text_count}++;
366
}
367
$current->{text_value} += $tag->{count};
368
splice @elems, 0, $tag->{count};
369
} elsif ( ref $tag ) {
370
if ( ($current->{type} eq 'td' || $current->{type} eq 'tr' ) && $tag->{type} eq 'tr' ) {
371
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback till table begin... !!!!\n" if $debug;
372
do {
373
$current = pop @stack;
374
$level = $current->{level};
375
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
376
} while ( ($current->{type} !~ /table|body|html/) && scalar @stack );
377
378
}
379
if ( $current->{type} eq 'table' && $tag->{type} eq 'table' ) {
380
# warn "!!!! Error: HTML validation. ID=[$current->{id}]. Stack rollback, previous table(s) will forced to be closed... !!!!\n" if $debug;
381
do {
382
$current = pop @stack;
383
$level = $current->{level};
384
# warn "New current type: /$current->{type}. ID = $current->{id}. Level: $level. Stack depth: ".scalar(@stack)."\n";
385
} while ( ($current->{type} eq "table") && scalar @stack );
386
387
}
388
$tag->{id} = $id++;
389
$tag->{children} = [];
390
$tag->{text_count} = 0;
391
$tag->{parent} = $current;
392
$tag->{level} = ++$level;
393
$elem_hash{$tag->{id}} = $tag;
394
push @stack, $current;
395
warn "Open type: $tag->{type}. ID=[$tag->{id}]. Name: ".($tag->{params}{name}||'').". Class: ".($tag->{params}{class}||'').". Level: $tag->{level}. Stack depth: ".scalar(@stack)."\n";
396
$current = $tag;
397
splice @elems, 0, $tag->{count};
398
} else {
399
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
400
last;
401
}
402
} elsif ( $elems[0] eq '<' && $elems[1] =~ /\// ) {
403
my $tag = $self->__try_end (\@elems);
404
if ( ref $tag && $tag->{type} eq 'text' ) {
405
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
406
if ( ref $last_text_tag ) {
407
$last_text_tag->{text} .= $tag->{content};
408
$last_text_tag->{count} += $tag->{count};
409
} else {
410
$last_text_tag = $tag;
411
$last_text_tag->{id} = $id++;
412
$last_text_tag->{type} = 'text';
413
$last_text_tag->{parent} = $current;
414
$last_text_tag->{level} = $level+1;
415
$elem_hash{$last_text_tag->{id}} = $last_text_tag;
416
push @{$current->{children}}, $last_text_tag;
417
$current->{text_count}++;
418
}
419
$current->{text_value} += $tag->{count};
420
splice @elems, 0, $tag->{count};
421
} elsif ( ref $tag ) {
422
if ( $current->{type} ne $tag->{type} ) {
423
# warn "!!!!Wrong tag type for closing. It's [$tag->{type}]. It must be [$current->{type}]!!!!\n" if $debug;
424
# warn "Current ID: [$current->{id}]. Text place: [".substr($current->{text}, 0, 20)."]\n";
425
if ( $hierarchy{$tag->{type}} > $hierarchy{$current->{type}} ) {
426
do {
427
$current = pop @stack;
428
$level = $current->{level};
429
# warn "New current type: /$current->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
430
} while ( ($current->{type} ne $tag->{type}) && scalar @stack );
431
$current = pop @stack;
432
$level = $current->{level};
433
# warn "Close !the right! type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
434
}else{
435
# warn "Passing by: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n" if $debug;
436
}
437
} else {
438
if ( @collaborate ) {
439
if ( defined $previous && (grep { $current->{type} eq $_->{condition} } @collaborate)
440
&& $previous->{type} eq $current->{type} && $previous->{level} == $current->{level}
441
&& $previous->{text} && $current->{text} ) {
442
$previous->{text} .= ' '.$current->{text};
443
my $parent = $current->{parent};
444
splice @{$parent->{children}}, -1, 1;
445
delete $elem_hash{$current->{id}};
446
$current = undef;
447
} elsif ( !defined $previous && $current->{text} ) {
448
$previous = $current;
449
} else {
450
$previous = undef;
451
}
452
}
453
$current = pop @stack;
454
$level = $current->{level};
455
# warn "Text place: [".substr($current->{text}, 0, 20)."]\n" if exists $current->{text};
456
# warn "Close type: /$tag->{type}. Level: $level. Stack depth: ".scalar(@stack)."\n";
457
}
458
splice @elems, 0, $tag->{count};
459
} else {
460
# warn "!!!! Error: HTML analyse. Job broken... !!!!\n" if $debug;
461
last;
462
}
463
} else {
464
my $last_text_tag = ref $current->{children} eq 'ARRAY' && @{$current->{children}} && $current->{children}->[-1]->{type} eq 'text' ? $current->{children}->[-1] : undef;
465
if ( ref $last_text_tag ) {
466
$last_text_tag->{text} .= shift @elems;
467
$last_text_tag->{count}++;
468
} else {
469
$last_text_tag->{text} = shift @elems;
470
$last_text_tag->{count} = 1;
471
$last_text_tag->{id} = $id++;
472
$last_text_tag->{type} = 'text';
473
$last_text_tag->{parent} = $current;
474
$last_text_tag->{level} = $level+1;
475
$elem_hash{$last_text_tag->{id}} = $last_text_tag;
476
push @{$current->{children}}, $last_text_tag;
477
$current->{text_count}++;
478
$current->{text_value} = 0;
479
}
480
$current->{text_value}++;
481
}
482
}
483
return (\%tree, \%elem_hash);
484
}
485
486
487
sub __try_tag {
488
my ($self, $content) = @_;
489
490
my $i = 1;
491
my %tag;
492
my $tag = $content->[0];
493
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
494
$tag .= $content->[$i];
495
$i++;
496
}
497
if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
498
return {
499
type => 'text',
500
content => $tag,
501
count => $i,
502
};
503
} else {
504
if ( $tag =~ /^<(div|table|tr|td|body|html)\s*(.*)/i ) {
505
my $val = $1;
506
if ( $tag =~ /^<($val)\s*(.*)/i ) {
507
$tag{type} = lc($1);
508
my $args = $2;
509
$tag{count} = $i+1;
510
my %args;
511
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?"([^"]+)"/g ) {
512
$args{lc($1)} = $2;
513
}
514
while ( $tag =~ /([a-zA-z]+)\x20*?=\x20*?'([^']+)'/g ) {
515
$args{lc($1)} = $2;
516
}
517
while ( $tag =~ /([a-zA-z]+)=(\w+)/g ) {
518
$args{lc($1)} = $2;
519
}
520
foreach my $arg ( qw( name id class width align ) ) {
521
$tag{params}{$arg} = $args{$arg} if exists $args{$arg};
522
}
523
return \%tag;
524
} else {
525
return {
526
type => 'text',
527
content => $tag,
528
count => $i,
529
};
530
}
531
} else {
532
return {
533
type => 'text',
534
content => $tag,
535
count => $i,
536
};
537
}
538
}
539
}
540
541
sub __try_end {
542
my ($self, $content) = @_;
543
544
my $i = 2;
545
my %tag;
546
my $tag = $content->[0].$content->[1];
547
while ( $content->[$i] ne '<' && $content->[$i] ne '>' && $i < scalar @$content ) {
548
$tag .= $content->[$i];
549
$i++;
550
}
551
if ( $content->[$i] eq '<' || $i >= scalar @$content ) {
552
return {
553
type => 'text',
554
content => $tag,
555
count => $i,
556
};
557
} else {
558
if ( $tag =~ /^<\/(div|table|tr|td|body|html)/i ) {
559
my $val = $1;
560
if ( $tag =~ /^<\/($val)\s*?$/i ) {
561
$tag{type} = lc($1);
562
$tag{count} = $i+1;
563
return \%tag;
564
} else {
565
return {
566
type => 'text',
567
content => $tag,
568
count => $i,
569
};
570
}
571
} else {
572
return {
573
type => 'text',
574
content => $tag,
575
count => $i,
576
};
577
}
578
}
579
}
580
581
582
sub __extract_img {
583
my ($self, $structure, $base_url, $debug) = @_;
584
return unless ref $structure eq 'HASH';
585
586
foreach my $tag ( values %$structure ) {
587
next unless ref $tag && exists $tag->{text} && $tag->{text};
588
my $text = $tag->{text};
589
while ( $text =~ /<img (.*?)>/sgi ) {
590
# warn "Image for extract_img found [$1]. Tag ID: $tag->{id}\n";
591
my $params = $1;
592
my $img = {};
593
if ( $params =~ /src\x20*?=\x20*?["'](.*?)["']/ || $params =~ /src=([^\x20]+)/ ) {
594
$img->{url} = $1;
595
$img->{url} =~ s/[\r\t\n\x20]+$//;
596
$img->{url} =~ s/^[\r\t\n\x20]+//;
597
$img->{url} = $base_url.'/'.$img->{url} unless $img->{url} =~ /^http:/;
598
$img->{url} =~ s/\/+/\//sgi;
599
$img->{url} =~ s/http:\//http:\/\//sgi;
600
$img->{w} = $1 if $params =~ /width[\D]+(\d+)/;
601
$img->{h} = $1 if $params =~ /height[\D]+(\d+)/;
602
$img->{alt} = $1 if $params =~ /alt\x20*?=\x20*?["'](.*?)["']/;
603
$tag->{images} = [] unless ref $tag->{images} eq 'ARRAY';
604
push @{ $tag->{images} }, $img;
605
# warn "Image for extract_img stored [$img->{url}]. Tag ID: $tag->{id}\n";
606
}
607
}
608
$text =~ s/<img (.*?)>//sgi;
609
$tag->{text} = $text;
610
$tag->{count} = length ($text);
611
}
612
}
613
614
615
sub __extract_headers {
616
my ($self, $structure, $debug) = @_;
617
return unless ref $structure eq 'HASH';
618
619
foreach my $tag ( values %$structure ) {
620
next unless ref $tag && exists $tag->{text} && $tag->{text};
621
my $text = $tag->{text};
622
while ( $text =~ /<h([\d])[^>]*?>([^<]+)<\/h[\d]>/sgi ) {
623
my $header_level = $1;
624
my $header_text = $2;
625
$tag->{headers} = [] unless ref $tag->{headers} eq 'ARRAY';
626
push @{ $tag->{headers} }, { level => $header_level, text => $header_text };
627
}
628
}
629
630
}
631
632
633
sub __dig_big_texts {
634
my ($self, %opts) = @_;
635
my $structure = exists $opts{structure} ? $opts{structure} : undef;
636
my $minimum = exists $opts{min} ? $opts{min} : undef;
637
my $debug = exists $opts{debug} ? $opts{debug} : undef;
638
my $rools = exists $opts{rools} ? $opts{rools} : undef;
639
return unless ref $structure eq 'HASH';
640
641
my @rools;
642
if ( ref $rools eq 'ARRAY' && @$rools) {
643
@rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'element' } @$rools;
644
}
645
my @exclude_rools;
646
if ( ref $rools eq 'ARRAY' && @$rools) {
647
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'element' } @$rools;
648
}
649
650
my @ret;
651
foreach my $tid ( sort { $a <=> $b } keys %$structure ) {
652
my $tag = $structure->{$tid};
653
next unless ref $tag && exists $tag->{text} && $tag->{text};
654
next if $self->__exclude_rools($tag, \@exclude_rools);
655
656
if ( @rools ) {
657
my $choose = 0;
658
foreach my $rool ( @rools ) {
659
my $matched = 1;
660
foreach my $cond ( @{$rool->{condition}} ) {
661
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
662
$matched = 0;
663
}
664
}
665
$choose ||= $matched;
666
}
667
if ( $choose ) {
668
$tag->{text} =~ s/^[\t\ \n\r]+//s;
669
$tag->{text} =~ s/[\t\ \n\r]+$//s;
670
$tag->{text} =~ s/[\t\ ]+/\ /sg;
671
$tag->{text} =~ s/\r//sg;
672
$tag->{text} =~ s/\n{2,}/\n\n/sg;
673
$tag->{text} =~ s/<a.*?>//sgi;
674
$tag->{text} =~ s/<\/a.*?>//sgi;
675
$tag->{text} =~ s/\&\\x(\d+)//sgi;
676
677
my $text = $tag->{text};
678
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
679
# $text = Contenido::Parser::Util::strip_html($text);
680
if ( length($text) >= $minimum ) {
681
push @ret, $tag;
682
}
683
}
684
} else {
685
my $text = $tag->{text};
686
$text =~ s/^[\t\ \n\r]+//s;
687
$text =~ s/[\t\ \n\r]+$//s;
688
$text =~ s/[\t\ ]+/\ /sg;
689
$text =~ s/\r//sg;
690
$text =~ s/\n{2,}/\n\n/sg;
691
# It does wrong job:
692
# $text =~ s/([,!\?])(\S)/$1\ $2/sg;
693
$tag->{text} = $text;
694
$text =~ s/<a.*?href.*?<\/a[^>]*?>//sgi;
695
# $text = Contenido::Parser::Util::strip_html($text);
696
if ( length($text) >= $minimum ) {
697
$tag->{text} =~ s/<a.*?>//sgi;
698
$tag->{text} =~ s/<\/a.*?>//sgi;
699
$tag->{text} =~ s/\&\\x(\d+)//sgi;
700
push @ret, $tag;
701
}
702
}
703
}
704
unless ( @ret ) {
705
warn "Nothing was found at all!!! Check your ROOLS or MINIMUM value" if $debug;
706
}
707
return \@ret;
708
}
709
710
711
712
sub __check_headers {
713
my ($self, $chosen, $header, $debug) = @_;
714
return unless ref $chosen eq 'ARRAY';
715
716
unless ( grep { exists $_->{headers} } @$chosen ) {
717
warn "No headers found\n" if $debug;
718
return $chosen;
719
} else {
720
# @$chosen = grep { exists $_->{headers} } @$chosen;
721
}
722
my @ret;
723
foreach my $unit ( @$chosen ) {
724
unless ( exists $unit->{headers} && ref $unit->{headers} eq 'ARRAY' ) {
725
$unit->{header_identity} = 0;
726
$unit->{header_min_level} = 32768;
727
next;
728
}
729
my @headers = sort { $a->{level} <=> $b->{level} } @{$unit->{headers}};
730
my $min_level = $headers[0]->{level};
731
$unit->{header_min_level} = $min_level;
732
if ( $header ) {
733
my $coeff = $self->__str_compare( $header, $headers[0]->{text} );
734
$unit->{header_identity} = $coeff;
735
}
736
}
737
# @ret = sort { $a->{header_min_level} <=> $b->{header_min_level} } @$chosen;
738
# return \@ret;
739
return $chosen;
740
}
741
742
743
744
sub __check_description {
745
my ($self, $chosen, $desc, $debug) = @_;
746
return unless ref $chosen eq 'ARRAY' && $desc;
747
748
my @ret;
749
foreach my $unit ( @$chosen ) {
750
if ( $desc ) {
751
my $coeff = $self->__str_compare( $unit->{text}, $desc );
752
warn "Coeff: [$coeff] to: [$unit->{text}]\n" if $debug;
753
$unit->{description_identity} = $coeff;
754
}
755
}
756
@ret = sort { $b->{description_identity} <=> $a->{description_identity} } grep { $_->{description_identity} > -0.9 } @$chosen;
757
return \@ret;
758
}
759
760
761
# wtf, bastards! how come my code's used here? --ra
762
# damn, it's not 100% your code already
763
sub __str_compare {
764
my ($self, $original, $applicant) = @_;
765
766
my $Al = __freq_list($original);
767
return -1 unless defined $Al;
768
my $Bl = __freq_list($applicant);
769
return -1 unless defined $Bl;
770
my $df = 0;
771
772
foreach my $word ( %$Bl ) {
773
if ( exists $Al->{$word} ) {
774
$df += $Al->{$word}
775
} else {
776
$df -= $Bl->{$word}
777
}
778
}
779
780
return $df;
781
}
782
783
sub __freq_list {
784
785
my @d = grep { length($_) > 3 } split /\W/, $_[0];
786
return undef unless @d;
787
my $z = 1/scalar(@d); my %l = ();
788
$l{$_} += $z for @d; \%l;
789
}
790
791
792
793
sub __strip_html {
794
my ($self, %opts) = @_;
795
return unless ref $opts{chosen} eq 'ARRAY';
796
797
my $chosen = $opts{chosen};
798
my $rooles = $opts{rools};
799
my $header = $opts{header};
800
801
foreach my $unit ( @$chosen ) {
802
my %tags;
803
my $headers = $unit->{headers} if exists $unit->{headers};
804
if ( ref $headers && ref $rooles eq 'ARRAY' && grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' } @$rooles ) {
805
if ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'all' } @$rooles ) {
806
$unit->{text} =~ s/<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/\n/sgi;
807
$unit->{text} =~ s/^[\x20\t\r\n]+//si;
808
} elsif ( grep { $_->{command} eq 'kill' && $_->{condition}{param} eq 'headers' && $_->{condition}{value} eq 'leading' } @$rooles ) {
809
while ( $unit->{text} =~ /^<h(\d)[^>]*>(.*?)<\/h(\d)[^>]*>/si ) {
810
my $hdr = 'h'.$1;
811
$unit->{text} =~ s/^<$hdr[^>]*>(.*?)<\/$hdr[^>]*>//si;
812
}
813
}
814
}
815
for ( $unit->{text} ) {
816
s/></> </sg;
817
s/([\!\?:.])\s*?<\/h(\d+)(.*?)>/$1 /sgi;
818
s/<\/h(\d+)(.*?)>/\. /sgi;
819
s/<h(\d+)(.*?)>/\n/sgi;
820
s/&/\&/sg;
821
s/&/\&/sgi;
822
s/«/«/sg;
823
s/»/»/sg;
824
s/£/£/sg;
825
s/–/–/sg;
826
s/—/—/sg;
827
s/…/\.\.\./sg;
828
s/„/"/sg;
829
s/“/"/sg;
830
s/”/"/sg;
831
s/´/'/sg;
832
s/ /\n/sg;
833
s/"/"/sg;
834
s/ /\ /sgi;
835
}
836
# $unit->{text} = HTML::Entities::decode_entities($unit->{text});
837
# $unit->{text} = Contenido::Parser::Util::strip_html($unit->{text});
838
for ( $unit->{text} ) {
839
s/^[\x20\t\r\n]+//si;
840
s/^(\d+)\.(\d+)\.(\d+)//si;
841
s/^[\x20\t\r\n]+//si;
842
s/^(\d+):(\d+)//si;
843
s/^[\x20\t\r\n]+//si;
844
}
845
if ( lc(substr ($unit->{text}, 0, length($header) )) eq lc($header) ) {
846
substr $unit->{text}, 0, length($header), '';
847
$unit->{text} =~ s/^[\.\x20\t\r\n]+//sgi;
848
}
849
$unit->{text} =~ s/[\x20\t\r\n]+$//sgi;
850
}
851
}
852
853
854
855
sub __glue {
856
my ($self, $chosen, $glue, $debug) = @_;
857
return unless ref $chosen eq 'ARRAY';
858
859
my $i = 0;
860
861
if ( $glue->{subcommand} eq 'first' || $glue->{subcommand} eq 'all' ) {
862
my $count = exists $glue->{subcommand} && $glue->{subcommand} eq 'first' ? $glue->{condition} : 32768;
863
foreach my $unit ( @$chosen ) {
864
next unless $i++;
865
if ( $i <= $count ) {
866
$chosen->[0]->{text} .= "\n\n".$chosen->[$i-1]->{text};
867
}
868
}
869
} elsif ( $glue->{subcommand} eq 'order' && ref $glue->{condition} eq 'ARRAY' ) {
870
my $text = '';
871
my $i = 0;
872
foreach my $pos ( @{ $glue->{condition} } ) {
873
$text .= ($i++ ? "\n\n" : '').$chosen->[$pos-1]->{text};
874
}
875
$chosen->[0]->{text} = $text;
876
}
877
}
878
879
880
sub __get_images {
881
my ($self, %opts) = @_;
882
my $structure = exists $opts{structure} ? $opts{structure} : undef;
883
my $chosen = exists $opts{chosen} ? $opts{chosen} : undef;
884
my $debug = exists $opts{debug} ? $opts{debug} : undef;
885
my $rools = exists $opts{rools} ? $opts{rools} : undef;
886
my $base_url = delete $opts{base_url};
887
return unless ref $chosen && ref $structure;
888
889
return if ref $rools eq 'ARRAY' && grep { $_->{command} eq 'image_off' } @$rools;
890
my @use_rools;
891
my @exclude_rools;
892
my $no_validation = 0;
893
if ( ref $rools eq 'ARRAY' && @$rools) {
894
@use_rools = grep { $_->{command} eq 'use' && $_->{subcommand} eq 'image' } @$rools;
895
@exclude_rools = grep { $_->{command} eq 'exclude' && $_->{subcommand} eq 'image' } @$rools;
896
$no_validation = grep { $_->{command} eq 'dont' && $_->{subcommand} eq 'validate' && $_->{condition}{param} eq 'image' } @$rools;
897
}
898
my $image_depth;
899
if ( ref $rools eq 'ARRAY' && @$rools) {
900
my @rools = grep { $_->{command} eq 'image' && $_->{subcommand} eq 'depth' } @$rools;
901
$image_depth = $rools[-1]->{condition} if @rools;
902
}
903
904
my @images;
905
foreach my $tag ( values %$structure ) {
906
next unless exists $tag->{images} && ref $tag->{images} eq 'ARRAY';
907
next if $self->__exclude_rools($tag, \@exclude_rools);
908
909
if ( @use_rools ) {
910
my $choose = 0;
911
foreach my $rool ( @use_rools ) {
912
my $matched = 1;
913
foreach my $cond ( @{$rool->{condition}} ) {
914
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
915
$matched = 0;
916
}
917
}
918
$matched = 0 if $self->__exclude_rools($tag, \@exclude_rools);
919
$choose ||= $matched;
920
}
921
if ( $choose ) {
922
my @img = grep { $no_validation || $self->__img_is_valid ($_) } map {
923
my $img = rchannel::Image->new($_);
924
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
925
$img;
926
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
927
928
push @images, @img;
929
}
930
} else {
931
next if ($tag->{level}+1) < $chosen->{level};
932
next if $image_depth && ( $tag->{level} > ($chosen->{level} + $image_depth) );
933
934
my $ok = 0;
935
my $uphops = $tag->{level} > $chosen->{level} ? 1 : 2;
936
my $hops = $image_depth ? $image_depth : $tag->{level} - $chosen->{level} + $uphops;
937
next if ($hops - $uphops) > 4;
938
my @img_parents = ($tag->{id});
939
my $parent = $tag;
940
for ( 1..$hops ) {
941
$parent = $parent->{parent};
942
push @img_parents, $parent->{id};
943
}
944
$parent = $chosen;;
945
for ( 0..$uphops ) {
946
if ( grep { $parent->{id} == $_ } @img_parents ) {
947
$ok = 1;
948
last;
949
}
950
$parent = $parent->{parent};
951
}
952
if ( $ok ) {
953
my @img = grep { $self->__img_is_valid ($_) } map {
954
my $img = rchannel::Image->new($_);
955
$img->src($base_url.($img->src =~ m|^/| ? '' : '/').$img->src) unless $img->src =~ /^http:/;
956
$img;
957
} map { {src => $_->{url}, width => $_->{w}, height => $_->{h}, alt => $_->{alt}, title => $_->{alt}} } @{ $tag->{images} };
958
959
push @images, @img;
960
}
961
}
962
}
963
# warn Dumper (\@images);
964
if ( @images ) {
965
return \@images;
966
} else {
967
return undef;
968
}
969
}
970
971
972
sub __img_is_valid {
973
my ($self, $img) = @_;
974
975
# return 1;
976
if ( $img->check_online ) {
977
my $delim = 0;
978
my $w = $img->width;
979
my $h = $img->height;
980
if ( $w && $h ) {
981
foreach my $pair ( @bad_dimensions ) {
982
if ($w == $pair->{w} && $h == $pair->{h}) {
983
return undef;
984
}
985
}
986
$delim = ( $w >= $h ? $w : $h ) / ( $w >= $h ? $h : $w ) unless $delim;
987
if ( $w < 80 || $h < 80 || $delim > 2.5 ) {
988
return undef;
989
}
990
}
991
} else {
992
# warn "Image ".$img->src." not found on server";
993
return undef;
994
}
995
return 1;
996
}
997
998
999
sub __exclude_rools {
1000
my ($self, $tag, $rools) = @_;
1001
return undef unless ref $rools eq 'ARRAY' && @$rools;
1002
1003
my $choose = 0;
1004
foreach my $rool ( @$rools ) {
1005
my $matched = 1;
1006
foreach my $cond ( @{$rool->{condition}} ) {
1007
unless ( exists $tag->{params}{$cond->{param}} && $tag->{params}{$cond->{param}} eq $cond->{value} ) {
1008
$matched = 0;
1009
}
1010
}
1011
$choose ||= $matched;
1012
}
1013
return $choose;
1014
}
1015
1016
1017
sub __parse_rools {
1018
my ($self, $rools) = @_;
1019
return unless $rools;
1020
$rools =~ s/\r//sgi;
1021
my @rools = split /\n/, $rools;
1022
return unless @rools;
1023
1024
my @parsed;
1025
foreach my $rool ( @rools ) {
1026
my %pr;
1027
next if $rool =~ /^#/;
1028
$rool =~ s/[\x20\t]+$//;
1029
$rool =~ s/^[\x20\t]+//;
1030
if ( $rool =~ /^([\w']+)\s+(.*)$/ || $rool =~ /^(\w+)(.*)$/ ) {
1031
$pr{command} = lc($1);
1032
my $params = $2;
1033
1034
if ( $pr{command} eq 'cut' && $params =~ /^(\w+)\s+(.*)$/ ) {
1035
$pr{subcommand} = lc($1); $params = $2;
1036
next unless $pr{subcommand} =~ /^(untill|till|from|off|regex|to)$/;
1037
$params =~ s|([*+?/\\\|])|\\$1|sg unless $pr{subcommand} eq 'regex';
1038
$pr{condition} = $params;
1039
} elsif ( $pr{command} eq 'glue' ) {
1040
if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1041
$pr{subcommand} = $1; $params = $2;
1042
next unless $pr{subcommand} =~ /^(first|all|order)$/;
1043
if ( $pr{subcommand} eq 'order' ) {
1044
my @pars = grep { $_ } map { int($_) } split (/\s*,\s*/,$params);
1045
$pr{condition} = \@pars;
1046
} else {
1047
$pr{condition} = int($1);
1048
}
1049
} elsif ( $params =~ /(\d+)/i ) {
1050
$pr{subcommand} = 'first';
1051
$pr{condition} = int($1);
1052
} else {
1053
$pr{subcommand} = 'all';
1054
}
1055
} elsif ( $pr{command} eq 'trim' ) {
1056
if ( $params =~ /(left|right)/i ) {
1057
$pr{subcommand} = lc($1);
1058
} else {
1059
$pr{subcommand} = 'all';
1060
}
1061
} elsif ( $pr{command} eq 'collaborate' && $params =~ /^(div|td)/i ) {
1062
$pr{condition} = $1;
1063
} elsif ( $pr{command} eq 'image' && $params =~ /^off$/i ) {
1064
$pr{command} = 'image_off';
1065
} elsif ( $pr{command} eq 'image' && $params =~ /^(\w+)\s+(.*)$/si ) {
1066
$pr{subcommand} = lc($1); $params = $2;
1067
next unless $pr{subcommand} =~ /^(depth)$/;
1068
$pr{condition} = $params;
1069
} elsif ( $pr{command} eq 'set' ) {
1070
if ( $params =~ /^(limit)\s+(.*)$/si ) {
1071
$pr{subcommand} = lc($1);
1072
$params = $2;
1073
}
1074
if ( $params =~ /^(\w+)\s+(.*)$/ ) {
1075
$pr{condition} = { param => $1, value => $2 };
1076
} else {
1077
next;
1078
}
1079
} elsif ( $pr{command} eq 'kill' && $params =~ /^(leading|all)\s+(headers)$/ ) {
1080
$pr{command} = 'kill';
1081
$pr{condition} = { param => $2, value => $1 };
1082
} elsif ( $pr{command} eq 'use' && $params =~ /^(title)\s+(as)\s+(description)$/ ) {
1083
$pr{command} = 'set';
1084
$pr{condition} = { param => 'description', value => 'header' };
1085
} elsif ( $pr{command} eq 'use' && $params =~ /^(\w+)\s+(.*)$/ ) {
1086
$pr{subcommand} = $1; $params = $2;
1087
next unless $pr{subcommand} =~ /^(element|elem|image)$/;
1088
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1089
my @conditions;
1090
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1091
push @conditions, { param => lc($1), value => $2 }
1092
}
1093
$pr{condition} = \@conditions;
1094
} elsif ( $pr{command} eq 'exclude' && $params =~ /^(\w+)\s+(.*)$/ ) {
1095
$pr{subcommand} = lc($1); $params = $2;
1096
next unless $pr{subcommand} =~ /^(image|elem|element)$/;
1097
$pr{subcommand} = 'element' if $pr{subcommand} =~ /^(element|elem)$/;
1098
my @conditions;
1099
while ( $params =~ /(class|id|name|align)\x20*=\x20*"([^"]+)"/sgi ) {
1100
push @conditions, { param => lc($1), value => $2 }
1101
}
1102
$pr{condition} = \@conditions;
1103
} elsif ( ($pr{command} eq 'dont' || $pr{command} eq "don't") && $params =~ /^(\w+)\s+(.*)$/ ) {
1104
$pr{command} = 'dont';
1105
$pr{subcommand} = lc($1); $params = $2;
1106
next unless $pr{subcommand} =~ /^(cut|validate)$/;
1107
my @conditions;
1108
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1109
$pr{condition} = { param => lc($1), value => $2 };
1110
} elsif ( $params =~ /(image)/i ) {
1111
$pr{condition} = { param => lc($1) };
1112
} else {
1113
next;
1114
}
1115
} elsif ( $pr{command} eq 'clean' ) {
1116
if ( $params =~ /^(off)\s+(.*)$/ ) {
1117
$pr{subcommand} = lc($1); $params = $2;
1118
}
1119
if ( $params =~ /(tag)\x20*=\x20*"([^"]+)"/sgi ) {
1120
$pr{condition} = { param => lc($1), value => $2 };
1121
} elsif ( $params =~ /(span)/i ) {
1122
$pr{condition} = { param => 'tag', value => lc($1) };
1123
} else {
1124
next;
1125
}
1126
} else {
1127
next;
1128
}
1129
push @parsed, \%pr;
1130
}
1131
}
1132
return ( scalar @parsed ? \@parsed : undef );
1133
}
1134
1135
1136
sub __post_rool {
1137
my ($self, $element, $rools, $description) = @_;
1138
1139
if ( ref $rools eq 'ARRAY' && @$rools ) {
1140
foreach my $rool ( @$rools ) {
1141
if ( $rool->{command} eq 'cut' ) {
1142
my $condition = $rool->{condition};
1143
if ( $rool->{subcommand} eq 'off' ) {
1144
$element->{text} =~ s/$condition//sgi;
1145
} elsif ( $rool->{subcommand} eq 'from' ) {
1146
my $pos = index $element->{text}, $condition;
1147
if ( $pos >= 0 ) {
1148
$element->{text} = substr $element->{text}, 0, $pos;
1149
}
1150
# $element->{text} =~ s/$condition(.*)$//si;
1151
} elsif ( $rool->{subcommand} eq 'to' && $condition eq 'description' && $description ) {
1152
my $str = substr $description, 0, 12;
1153
my $pos = index $element->{text}, $str;
1154
if ( $pos >= 0 ) {
1155
$element->{text} = substr $element->{text}, $pos, -1;
1156
}
1157
} elsif ( $rool->{subcommand} eq 'till' ) {
1158
$element->{text} =~ s/^(.*?)($condition)//si;
1159
} elsif ( $rool->{subcommand} eq 'untill' ) {
1160
$element->{text} =~ s/^(.*?)($condition)/$2/si;
1161
} elsif ( $rool->{subcommand} eq 'regex' ) {
1162
$element->{text} =~ s/$condition//sgi;
1163
}
1164
} elsif ( $rool->{command} eq 'trim' ) {
1165
if ( $rool->{subcommand} eq 'left' ) {
1166
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1167
} elsif ( $rool->{subcommand} eq 'right' ) {
1168
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1169
} else {
1170
$element->{text} =~ s/^[\x20\xA0\t\n\r]+//sg;
1171
$element->{text} =~ s/[\x20\xA0\t\n\r]+$//sg;
1172
}
1173
}
1174
}
1175
}
1176
}
1177
1178
1179
1;
Небольшая справка по веткам
cnddist – контейнер, в котором хранятся все дистрибутивы всех библиотек и программных пакетов, которые использовались при построении различных версий Contenido. Если какой-то библиотеки в данном хранилище нет, инсталлятор сделает попытку "подтянуть" ее с веба (например, с CPAN). Если библиотека слишком старая, есть очень большая вероятность, что ее там уже нет. Поэтому мы храним весь хлам от всех сборок. Если какой-то дистрибутив вдруг отсутствует в cnddist - напишите нам, мы положим его туда.
koi8 – отмирающая ветка, чей код, выдача и все внутренние библиотеки заточены на кодировку KOI8-R. Вносятся только те дополнения, которые касаются внешнего вида и функционала админки, баги ядра, обязательные обновления портов и мелочи, которые легко скопипастить. В дальнейшем планируется полная остановка поддержки по данной ветке.
utf8 – актуальная ветка, заточенная под UTF-8.
Внутри каждой ветки: core – исходники ядра; install – скрипт установки инсталляции; plugins – плагины; samples – "готовые к употреблению" проекты, которые можно поставить, запустить и посмотреть, как они работают.