#!/usr/bin/perl -w
# Copyright (c) 2003 Philip S Tellis
# Licenced under the GPL
# $Id: parse-content.pl,v 1.2 2003/12/17 13:54:31 bluesmoon Exp $
#
# Parses an HTML page with HTML::Parser, collects the markup found inside
# "container" tags, scores each container by text length and link count,
# and prints the containers that pass the filters after running them
# through mid_filter, make_rss and post_filter.
#
# Arguments (both positional, both required):
#   1. site name - selects one of the per-site parameter sets below
#   2. input     - passed to HTML::Parser's parse_file
#
# NOTE(review): this copy of the file looks damaged. Many string literals
# and regular expressions appear to have lost "<...>" markup (several
# patterns start with "]*>" and several prints emit only "\n"), and three
# statements are syntactically incomplete. Each suspect spot is flagged
# below; restore them from the upstream revision rather than guessing.

use strict;
use HTML::Parser;
use Text::Wrap;

# Wrap width used by Text::Wrap::wrap in make_rss.
$Text::Wrap::columns=72;

my $revision = '$Revision: 1.2 $';

# First command-line argument: the site name.
my $site = shift || die "No type";

# When true, containers flagged 'ignore' by filter_content are not printed.
my $hide_ignored = 1;

# defaults
# $container_tags       - regex matched against tag names; a match opens
#                         (start tag) or closes (end tag) a container
# $link_lo / $link_hi   - inclusive bounds on a container's link-index
# $test_link_popularity - enables the link tally section of the driver
# $content_threshold    - minimum length of a container's \W-stripped text
# $base_href            - prefix added by make_rss to links not starting
#                         with "http"
my $container_tags = '^table$';
my $link_lo = 1;
my $link_hi = 2;
my $test_link_popularity = 0;
my $content_threshold = 40;
my $base_href = "";

# Per-site overrides; an unrecognised site name keeps the defaults above.
if($site eq 'indian express') {
    $container_tags = '^table$';
    $link_lo = 1;
    $link_hi = 2;
    $test_link_popularity = 1;
    $content_threshold = 115;
    $base_href = "http://www.indianexpress.com/";
} elsif($site eq 'financial express') {
    $container_tags = '^table$';
    $link_lo = 1;
    $link_hi = 3;
    $test_link_popularity = 1;
    $content_threshold = 119;
    $base_href = "http://www.financialexpress.com/";
} elsif($site eq 'hindu') {
    $container_tags = '^(p|td|hr)$';
    $link_lo = 1;
    $link_hi = 1;
    $test_link_popularity = 1;
    $content_threshold = 103;
    $base_href = "http://www.hinduonnet.com/";
} elsif($site eq 'hindustan times') {
    $container_tags = '^td$';
    $link_lo = 1;
    $link_hi = 1;
    $test_link_popularity = 1;
    $content_threshold = 80;
    $base_href = "http://www.hindustantimes.com/";
} elsif($site eq 'businessline') {
    $container_tags = '^(p|hr)$';
    $link_lo = 1;
    $link_hi = 1;
    $test_link_popularity = 1;
    $content_threshold = 117;
    $base_href = "http://www.thehindubusinessline.com/";
}

# Forward declarations. The ($) prototypes are what allow the paren-less
# chain "post_filter make_rss mid_filter ..." in the driver loop to parse.
sub mid_filter($);
sub post_filter($);
sub filter_content($);
sub make_rss($);

my $p = HTML::Parser->new(api_version => 3);

# @containers - one entry per container tag seen; raw markup strings while
#               parsing, replaced by hash refs from filter_content later
# @links/%links - link list and per-link tally used when
#               $test_link_popularity is set
# $in_container - nesting depth of currently open container tags
# $avg_content_length / $ncontent - running mean of accepted containers'
#               content length, and how many have been accepted
my @containers = ();
my @links = ();
my %links = ();
my $in_container=0;
my $avg_content_length=0;
my $ncontent=0;

# Patterns deleted (case-insensitively, /s) from every container by
# filter_content.
# NOTE(review): these look truncated. As written, "]*>" at the start of a
# pattern matches zero or more "]" followed by ">", the two '.*?' entries
# match only the empty string, and the last entry removes every ">" -
# presumably each originally began with an opening tag. Confirm against
# upstream before relying on them.
my @ignore_text = (
    ']*>(<[^>]*>)?\s*full\s+story\s*(<[^>]*>)?',
    ']*>(<[^>]*>)?\s*More(?: in [\w\s]+)?(<[^>]*>)?',
    ']*>(?:Related Stories|Pictures|Full Coverage|Special Links)(?:<[^>]*>)*?',
    '.*?',
    '.*?',
    '<([^a]\w*)[^>]*>[^\w<]*',
    ']*>'
);

# Empty-string handler: per the HTML::Parser documentation this makes the
# parser ignore comment events.
$p->handler( comment => "");
# NOTE(review): the argspec literal below contains a line break between
# "text," and "attr"; it is kept exactly as found, but is probably a
# line-wrapping artifact - confirm.
$p->handler( start => \&start_handler, "tagname, self, text, 
attr");
$p->handler( end => \&end_handler, "tagname");
$p->ignore_tags(qw(tbody div));
$p->ignore_elements(qw(script style noscript));

# Second command-line argument: the input handed to parse_file.
$p->parse_file(shift || die "No input") || die $!;

# Replace each raw container string with filter_content's hash ref.
@containers = map filter_content($_), @containers;

# NOTE(review): this and the other bare "\n" prints below emit only
# newlines as shown; presumably they originally printed markup.
print "\n";

if($test_link_popularity) {
    print "\n";
    # Normalise every collected link and count occurrences.
    # The loop variable aliases the array elements, so the substitutions
    # rewrite @links in place.
    # NOTE(review): nothing in the visible code ever adds to @links;
    # presumably one of the damaged statements in filter_content did.
    for my $link (@links) {
        # Drop the scheme and host.
        $link =~ s#http://[\w.]+##;
        if($link =~ /\?/) {
            # Drop the query string.
            $link =~ s#\?.+##;
        } elsif($link =~ m#/[\d_,]+\.s?html?$#) {
            # Replace a trailing digits/underscores/commas .htm/.html/
            # .shtm/.shtml file name with "/", leaving its directory.
            $link =~ s#/[\d_,]+\.s?html?$#/#;
        }
        # Skip links that reduce to the site root or to nothing.
        next if $link =~ m#^/?$#;
        $links{$link}++;
    }
    # Keep only links seen at least $ncontent/2 times.
    for my $link (keys %links) {
        if($links{$link} < $ncontent/2) {
            delete $links{$link};
        }
    }
    # One line of output per surviving link, most frequent first.
    for my $link (sort {$links{$b} <=> $links{$a}} keys %links) {
        print "\n";
    }
    print "\n";
}

for my $container (@containers) {
    next if $container->{'ignore'} && $hide_ignored;
    print "\n\n";
    # NOTE(review): the next statement is syntactically incomplete (the
    # match pattern, the loop's opening brace and the declarations of
    # $pr, $l2 and $po are missing). It is kept verbatim; restore it from
    # upstream. What remains shows a substitution on the container's
    # content built from $pr, $l2 and $po.
    while($test_link_popularity && $container->{'content'} =~ /({'content'} =~ s#$pr$l2$po[^>]*>[^<]*##i;
        }
    }
    # Applied right to left: mid_filter, then make_rss, then post_filter.
    print post_filter make_rss mid_filter $container->{'content'};
}

# When set, text events are dropped and the next </a> is swallowed by
# end_handler. Only the commented-out code in start_handler ever sets it.
my $skip_next_a = 0;

# HTML::Parser "end" handler; receives the tag name.
# Does nothing outside a container. Decrements the nesting depth when the
# tag matches $container_tags, and clears $skip_next_a on a closing "a".
sub end_handler {
    return unless $in_container;
    my $tag = shift;
    $in_container-- if $tag =~ /$container_tags/;
    if($tag eq 'a' && $skip_next_a) {
        $skip_next_a = 0;
        return;
    }
    # NOTE(review): appending "" is a no-op as written; presumably the
    # closing tag text was appended here originally - confirm.
    $containers[$#containers] .= "";
}

# HTML::Parser "start" handler; receives tagname, the parser object, the
# tag's source text and its attribute hash (see the argspec above).
# Tags outside any container are ignored unless they open one. A container
# tag increments the nesting depth and pushes a new empty entry onto
# @containers, so a nested container tag starts a fresh entry and later
# text is appended to that newest entry.
sub start_handler {
    my $tag = shift;
    return if $tag !~ /$container_tags/ && !$in_container;
    my $self = shift;
    my $text = shift;
    my $attr = shift;
    if($tag =~ /$container_tags/) {
        $in_container++;
        # (Re)installs the text handler on every container start; the
        # closure appends text to the most recent container.
        $self->handler( text => sub {
                return if $skip_next_a;
                $containers[$#containers] .= shift if $#containers >= 0;
            },
            "text"
        );
        push @containers, "";
    }
    # if($tag eq 'a' && (!$attr->{'href'} || $attr->{'href'} =~ /^(#|javascript)/)) {
    #     $skip_next_a=1;
    #     return;
    # }
    # Keep the tag's own source text as part of the container.
    $containers[$#containers] .= $text;
}

# Scores one container's raw markup.
# Takes the markup string; returns a hash ref with the keys
#   'content-index' - length of the text left in $c2 after stripping
#   'link-index'    - value of $link_index
#   'ignore'        - 1 if the container failed a filter, else 0
#   'content'       - the (possibly cleaned) markup
# Side effects: updates the globals $avg_content_length and $ncontent for
# each accepted container.
sub filter_content($) {
    my $content = shift;
    my $ignore = 0;
    my $c2="";
    my $link_index=0;
    # NOTE(review): as shown this replaces a space with a space; the
    # pattern may originally have been an entity or a non-breaking space.
    $content =~ s/ / /g;
    foreach my $i (@ignore_text) {
        $content =~ s/$i//sig;
    }
    # NOTE(review): the condition below is syntactically incomplete (the
    # end of the match and the start of a following s### appear to be
    # missing). Kept verbatim; restore from upstream.
    if($content !~ />[^<]*\w+[^<]*.*<#><#s;
        $ignore = 1;
    } else {
        $c2 = $content;
        # $c2 =~ s#]+>[^<]+##gs;
        # NOTE(review): the next line is also damaged; nothing visible
        # ever increments $link_index, so this is presumably where links
        # were counted and tags stripped from $c2 - confirm.
        while($c2 =~ /]+>//g;
        # Drop every non-word character so the length counts only \w.
        $c2 =~ s/\W//g;
        # Reject when the text is too short, the link count is out of
        # range, or the text is much shorter than the running average.
        # The factor (0.8 - 0.2 * $ncontent) is <= 0 once four containers
        # have been accepted, so the last clause can no longer be true
        # from then on.
        if(length $c2 < 
$content_threshold || $link_index < $link_lo || $link_index > $link_hi || length $c2 < ($avg_content_length * ( 0.8 - 0.2 * $ncontent))) {
            $ignore = 1;
        } else {
            # Fold this container into the running mean.
            $avg_content_length = (($avg_content_length * $ncontent) + length($c2))/($ncontent+1);
            $ncontent++;
        }
    }
    unless ($ignore) {
        # NOTE(review): as written this deletes every ">" (optionally
        # preceded by "]"); presumably a tag name was lost from the start
        # of the pattern.
        $content =~ s#]*>##sgi;
    }
    # The anonymous hash is the sub's return value.
    {
        'content-index' => length($c2),
        'link-index' => $link_index,
        'ignore' => $ignore,
        'content' => $content
    };
}

# Rewrites the first link-plus-description found in the markup into a
# "title link description" text block and returns the result.
# The pattern uses /x (whitespace ignored), /s, /i and /e: it captures the
# href value as $1 and two following spans as $2 (title) and $3 (desc).
# The replacement code trims title and desc, strips a leading "/" from the
# link, prefixes $base_href unless the link starts with "http", and wraps
# desc with Text::Wrap.
# NOTE(review): the pattern, the s#.+?## applied to $desc and the output
# string all look like they lost markup; confirm against upstream.
sub make_rss($) {
    my $content = shift;
    $content =~ s{ ]+?href=['"]?([^'"> ]+)['"]?[^>]*>(.+?)(.*) } {
        my ($link, $title, $desc) = ($1, $2, $3);
        $desc =~ s#.+?##sig;
        $desc =~ s/^\s+//s;
        $desc =~ s/\s+$//s;
        $link =~ s#^/##;
        $link = "$base_href$link" if($link !~ /^http/);
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//s;
        " $title $link " . Text::Wrap::wrap(" ", " ", $desc) . " ";
    }sexi;
    return $content;
}

# Final clean-up pass over the text produced by make_rss.
# NOTE(review): as written the first substitution replaces the whole
# string with a single space and the second then deletes everything, so
# the function returns an empty string. Both patterns presumably lost
# their markup anchors - confirm against upstream.
sub post_filter($) {
    local $_ = shift;
    s/.*/ /s;
    s#.*##s;
    return $_;
}

# Replaces bytes \x91 and \x92 with "'" and \x93 and \x94 with '"', and
# deletes \x95; returns the modified copy.
# Presumably these are Windows-1252 curly quotes and bullet - confirm the
# input encoding.
sub mid_filter($) {
    local $_ = shift;
    s/(\x91|\x92)/'/g;
    s/(\x93|\x94)/"/g;
    s/\x95//g;
    return $_;
}