#!/usr/bin/perl -w
# Copyright (c) 2003 Philip S Tellis
# Licenced under the GPL
# $Id: parse-content.pl,v 1.2 2003/12/17 13:54:31 bluesmoon Exp $
use strict;
use HTML::Parser;
use Text::Wrap;
# Wrap item descriptions at 72 columns.
$Text::Wrap::columns = 72;
my $revision = '$Revision: 1.2 $';

# First command-line argument: the name of the site whose page is parsed.
my $site = shift || die "No type";

# When true, containers flagged 'ignore' by filter_content are not printed.
my $hide_ignored = 1;

# Per-site tuning.  Each row holds, in order:
#   container_tags        regex matched against tag names to open/close a container
#   link_lo, link_hi      inclusive bounds on a container's link count
#   test_link_popularity  whether the link-popularity pass is run
#   content_threshold     minimum amount of text a container must hold
#   base_href             prefix put in front of links not starting with "http"
my %settings_for = (
    'indian express'    => [ '^table$',     1, 2, 1, 115, "http://www.indianexpress.com/" ],
    'financial express' => [ '^table$',     1, 3, 1, 119, "http://www.financialexpress.com/" ],
    'hindu'             => [ '^(p|td|hr)$', 1, 1, 1, 103, "http://www.hinduonnet.com/" ],
    'hindustan times'   => [ '^td$',        1, 1, 1, 80,  "http://www.hindustantimes.com/" ],
    'businessline'      => [ '^(p|hr)$',    1, 1, 1, 117, "http://www.thehindubusinessline.com/" ],
);

# defaults, used for any site name that has no row above
my @default_settings = ( '^table$', 1, 2, 0, 40, "" );

my ( $container_tags, $link_lo, $link_hi,
     $test_link_popularity, $content_threshold, $base_href )
    = exists $settings_for{$site} ? @{ $settings_for{$site} }
                                  : @default_settings;
# Forward declarations.  The ($) prototypes let these subs be chained
# without parentheses further down
# (print post_filter make_rss mid_filter ...).
sub mid_filter($);
sub post_filter($);
sub filter_content($);
sub make_rss($);
# Event-driven parser using the version 3 handler API.
my $p = HTML::Parser->new(api_version => 3);
# One entry per container element seen.  The handlers append raw markup
# and text to the last entry; after parsing, each entry is replaced by the
# hash ref returned from filter_content.
my @containers = ();
# Links to be counted by the popularity pass.
# NOTE(review): nothing in the readable code pushes onto @links --
# presumably the damaged loop in filter_content did; confirm.
my @links = ();
# Normalised link => number of occurrences.
my %links = ();
# Nesting depth of container tags currently open.
my $in_container=0;
# Running mean of, and count of, the containers accepted by filter_content.
my $avg_content_length=0;
my $ncontent=0;
# Regex fragments deleted from every container by filter_content
# (applied with the /s, /i and /g flags).
# NOTE(review): these strings look as if markup was stripped out of them
# (two entries are a bare '.*?', several begin with ']*>', and the
# backreference entry ends in '\1>') -- restore them from the original
# source before relying on them.
my @ignore_text = (
']*>(<[^>]*>)?\s*full\s+story\s*(<[^>]*>)?',
']*>(<[^>]*>)?\s*More(?: in [\w\s]+)?(<[^>]*>)?',
']*>(?:Related Stories|Pictures|Full Coverage|Special Links)(?:<[^>]*>)*?',
'.*?',
'.*?',
'<([^a]\w*)[^>]*>[^\w<]*\1>',
'
]*>'
);
# An empty handler: comment events are discarded.
$p->handler( comment => "");
# start_handler gets (tagname, parser, original tag text, attribute hash);
# end_handler gets the tag name only.
$p->handler( start => \&start_handler, "tagname, self, text, attr");
$p->handler( end => \&end_handler, "tagname");
# tbody/div tags produce no events (their content still does);
# script/style/noscript elements are skipped together with their content.
$p->ignore_tags(qw(tbody div));
$p->ignore_elements(qw(script style noscript));
# Second command-line argument: the HTML file to parse.
$p->parse_file(shift || die "No input") || die $!;
# Replace every raw container string by the hash ref filter_content
# builds for it ('content', 'ignore', 'content-index', 'link-index').
@containers = map filter_content($_), @containers;
# NOTE(review): this and the other bare "\n" prints below presumably
# carried markup that has been lost -- confirm against the original.
print "\n";
# Optional pass: find the link prefixes shared by many of the accepted
# containers.
if($test_link_popularity) {
print "\n";
# Normalise each link in place (the loop variable aliases the element).
for my $link (@links) {
# Drop the leading "http://host" part.
$link =~ s#http://[\w.]+##;
# Drop a query string, or otherwise replace a trailing numeric file name
# (digits, '_' and ',' followed by .htm/.html/.shtml) with "/".
if($link =~ /\?/) {
$link =~ s#\?.+##;
} elsif($link =~ m#/[\d_,]+\.s?html?$#) {
$link =~ s#/[\d_,]+\.s?html?$#/#;
}
# Nothing left but the site root: not worth counting.
next if $link =~ m#^/?$#;
$links{$link}++;
}
# Forget links seen fewer than half as many times as there are accepted
# containers.
for my $link (keys %links) {
if($links{$link} < $ncontent/2) {
delete $links{$link};
}
}
# Walk the surviving links, most frequent first.
for my $link (sort {$links{$b} <=> $links{$a}} keys %links) {
print "\n";
}
print "\n";
}
# Emit every container that was not rejected (or all of them when
# $hide_ignored is false).
for my $container (@containers) {
next if $container->{'ignore'} && $hide_ignored;
print "\n\n";
# NOTE(review): the while loop below is damaged -- its match pattern and
# part of its body are missing, so it does not parse as written.  It
# references $pr, $l2 and $po, which are not declared anywhere in the
# visible code.  Restore from the original source.
while($test_link_popularity &&
$container->{'content'} =~
/({'content'} =~ s#$pr$l2$po[^>]*>[^<]*##i;
}
}
print post_filter make_rss mid_filter $container->{'content'};
}
# While true, the text handler installed by start_handler drops text and
# end_handler swallows the next closing </a>.  Only the disabled code in
# start_handler ever sets it, so at present it is never true.
my $skip_next_a = 0;

# End-tag callback for HTML::Parser, registered with the argspec "tagname".
#
# Parameters:
#   $tag - bare element name of the end tag (no "</" and no ">").
#
# Does nothing outside a container.  Otherwise it lowers the container
# nesting depth when $tag is a container tag and appends the closing tag
# to the container currently being collected.
sub end_handler
{
    return unless $in_container;
    my $tag = shift;
    $in_container-- if $tag =~ /$container_tags/;
    if($tag eq 'a' && $skip_next_a) {
        $skip_next_a = 0;
        return;
    }
    # Only the element name is passed in, so the closing tag has to be
    # rebuilt in full; appending just "$tag>" left stray text such as
    # "td>" in the collected markup instead of "</td>".
    $containers[$#containers] .= "</$tag>";
}
# Start-tag callback for HTML::Parser, registered with the argspec
# "tagname, self, text, attr".
#
# Parameters:
#   $tag  - element name of the start tag
#   $self - the parser object
#   $text - the start tag exactly as it appeared in the input
#   $attr - attribute hash (only used by the disabled code below)
#
# Tags seen outside any container are dropped unless they open one.
# Opening a container raises the nesting depth, (re)installs the text
# handler and starts a fresh entry in @containers; in every other case the
# tag text is appended to the entry being collected.
sub start_handler
{
    my ($tag, $self, $text, $attr) = @_;

    my $opens_container = $tag =~ /$container_tags/;
    return unless $opens_container || $in_container;

    if($opens_container) {
        $in_container++;

        # Text events go to the newest container, unless an <a> element
        # is being skipped or no container exists yet.
        my $collect_text = sub {
            my ($chunk) = @_;
            return if $skip_next_a;
            return unless @containers;
            $containers[-1] .= $chunk;
        };
        $self->handler( text => $collect_text, "text" );

        push @containers, "";
    }

    # Disabled: skip anchors without an href, or with a fragment or
    # javascript href.
    # if($tag eq 'a' && (!$attr->{'href'} || $attr->{'href'} =~ /^(#|javascript)/)) {
    # $skip_next_a=1;
    # return;
    # }

    $containers[-1] .= $text;
}
# Decide whether one collected container looks like a story, and describe it.
#
# Parameters:
#   $content - the markup/text collected for one container.
#
# Returns a hash ref with the keys
#   'content'       - the (filtered) markup
#   'ignore'        - 1 when the container should not be printed, else 0
#   'content-index' - length of the text measure $c2 (0 if never computed)
#   'link-index'    - value of $link_index (link count; see note below)
#
# Side effects: every accepted container updates the file-level running
# mean $avg_content_length and the counter $ncontent.
#
# NOTE(review): several lines in this sub look damaged (markup stripped
# from regex literals); as written the sub does not parse.  Restore from
# the original source.
sub filter_content($)
{
my $content = shift;
my $ignore = 0;
my $c2="";
my $link_index=0;
# NOTE(review): as shown this replaces a space with a space; presumably
# the pattern was an entity or special character -- confirm.
$content =~ s/ / /g;
# Delete every fragment matching one of the @ignore_text patterns.
foreach my $i (@ignore_text) {
$content =~ s/$i//sig;
}
# NOTE(review): the condition below and the while loop a few lines down
# are damaged.  Presumably the first rejects containers with no visible
# text and the loop counts links into $link_index -- confirm.
if($content !~ />[^<]*\w+[^<]*.*<#><#s;
$ignore = 1;
} else {
$c2 = $content;
# $c2 =~ s#]+>[^<]+##gs;
while($c2 =~ /]+>//g;
# Keep only word characters so that length($c2) measures the text.
$c2 =~ s/\W//g;
# Reject the container when it holds too little text, when its link count
# is outside [$link_lo, $link_hi], or when it is much shorter than the
# containers accepted so far.  The last test uses the factor
# 0.8 - 0.2 * $ncontent, which is zero or negative once four containers
# have been accepted, so it can only reject before that point.
if(length $c2 < $content_threshold
|| $link_index < $link_lo || $link_index > $link_hi
|| length $c2 < ($avg_content_length * ( 0.8 - 0.2 * $ncontent))) {
$ignore = 1;
} else {
# Fold this container's length into the running mean.
$avg_content_length = (($avg_content_length * $ncontent) + length($c2))/($ncontent+1);
$ncontent++;
}
}
# NOTE(review): presumably strips every tag except anchors from accepted
# containers; the pattern has lost its beginning -- confirm.
unless ($ignore) {
$content =~ s#?[^a/][^>]*>##sgi;
}
# The sub's return value: an anonymous hash.
{
'content-index' => length($c2),
'link-index' => $link_index,
'ignore' => $ignore,
'content' => $content
};
}
# Rewrite a container's markup as one feed item.
#
# Parameters:
#   $content - filtered markup of one container.
#
# Returns $content after a single substitution (no /g; flags s, e, x, i).
# The replacement is evaluated as code: the three captures are taken as
# link, title and description; title and description are trimmed of
# surrounding whitespace; a leading "/" is removed from the link and
# $base_href is put in front of it unless it starts with "http"; the
# description is re-flowed with Text::Wrap::wrap.  When the pattern does
# not match, $content is returned unchanged.
#
# NOTE(review): the pattern, the first substitution on $desc and the
# multi-line result string all look as if markup was stripped from them
# (the pattern starts with "]+?href=") -- restore from the original source
# before relying on the output format.
sub make_rss($)
{
my $content = shift;
$content =~
s{
]+?href=['"]?([^'"> ]+)['"]?[^>]*>(.+?)(.*)
} {
my ($link, $title, $desc) = ($1, $2, $3);
$desc =~ s#.+?##sig;
$desc =~ s/^\s+//s;
$desc =~ s/\s+$//s;
$link =~ s#^/##;
$link = "$base_href$link" if($link !~ /^http/);
$title =~ s/^\s+//s;
$title =~ s/\s+$//s;
"
-
$title
$link
" . Text::Wrap::wrap(" ", " ", $desc) . "
";
}sexi;
return $content;
}
# Final clean-up of the text produced by make_rss.
#
# Parameters:
#   the item text (taken from the argument list into a localized $_).
#
# Returns the text after two substitutions, both with /s so that "."
# also matches newlines.
#
# NOTE(review): both patterns look as if markup was stripped from them;
# the descriptions below are of the code as shown.  Presumably the intent
# was to cut away everything before and after the item -- confirm against
# the original source.
sub post_filter($)
{
local $_ = shift;
# As shown: everything up to and including the last "- " is replaced by
# a newline followed by "- ".
s/.*- /
- /s;
# As shown: everything from the first newline to the end of the string
# is replaced by a space.
s#
.*# #s;
return $_;
}
# Normalise a few high-bit punctuation bytes before the text is turned
# into a feed item.
#
# Parameters:
#   the text to clean (single scalar argument).
#
# Returns a copy in which \x91 and \x92 have become "'", \x93 and \x94
# have become '"', and \x95 has been removed (presumably the Windows-1252
# curly quotes and bullet).
sub mid_filter($)
{
    my ($text) = @_;

    # One pass: the first four characters map onto the replacement list,
    # and /d deletes \x95, which has no counterpart there.
    $text =~ tr/\x91\x92\x93\x94\x95/''""/d;

    return $text;
}