#! /usr/bin/perl
#use utf8;
use XML::Parser::Lite;
use Socket;
use Encode qw(:all);
#use threads;
#use threads::shared;

alarm(9.5*60);

$url='http://news.eu.by/';
$subject='Belarus';
$root=substr($0,0,rindex($0,'/'));
$pub="$root/dir";
$web="$root/html";
$temp="$root/tmp";
$xmlname="newz";
$deltaname="newz";
$htmlname="newz";
#my $enc='.gz';
$enc='';
$title='Breaking News!';
$rotate_time=5400;  # sec;
$proxy='195.50.2.154:8080';
@or=('Belarus','Belorussia','Byelorussia','Belarussian','Byelorussian','Bielorussia','Bielorusso','Bielorussa','Belarusse');
$|=1;

do "$root/fin";

my $encoding='windows-1251';

my $fail=0;

my $heads=qq(<title>$title</title>
<meta http-equiv="Content-Type" content="text/html; charset=$encoding">
<link rel="StyleSheet" type="text/css" href="/newz.css">
<link rel="icon" href="n1.ico">
);

my $ad1=q(
<script type="text/javascript"><!--
google_ad_client = "pub-1438492991159293";
google_ad_width = 728;
google_ad_height = 15;
google_ad_format = "728x15_0ads_al_s";
//2006-09-27: newz
google_ad_channel ="1395501299";
google_color_border = "CCCCCC";
google_color_bg = "CCCCCC";
google_color_link = "000000";
google_color_text = "333333";
google_color_url = "666666";
//--></script>
<script type="text/javascript"
  src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
);

my $ad2=q(
<script type="text/javascript"><!--
google_ad_client = "pub-1438492991159293";
google_ad_width = 728;
google_ad_height = 90;
google_ad_format = "728x90_as";
google_ad_type = "text";
//2006-09-27: newz
google_ad_channel ="1395501299";
//--></script>
<script type="text/javascript"
  src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
);

my %cont:shared=(
'mahatma.bspu.unibel.by'=>'Dzianis Kahanovich',
'www.bspu.unibel.by'=>'Belarussian State Pedagogical University',
'www.belta.by'=>'BELTA',
'news.google.com'=>'Google News',
'finance.google.com'=>'Google Finance'
);

my $or_='+OR+';
#my $or_='+%7C+';

my %goo=(
''=>{q=>join($or_,@or)},
'de'=>{ned=>'tde',q=>join($or_,('Wei%C3%9Frussland','Belarus','Belorussland','Wei%C3%9Fru%C3%9Fland','Beloru%C3%9Fland','wei%C3%9Frussisch','belarussich','wei%C3%9Fruthenisch','Belarusse','Wei%C3%9Frusse','Belarussin','Wei%C3%9Frussin','Wei%C3%9Frussische'))},
'nl_nl'=>{ned=>'tnl_nl',q=>join($or_,('Wit-Rusland','Wit-Russisch','Wit-Rus','Wit-Russin','Wit-Russische','Bjelo-Rusland','Bjelorussisch','Bjelorussische','Bjelorus','Belarus'))},
'fr'=>{ned=>'tfr',q=>join($or_,('Belarus','Bi%C3%A9lorussie','Bi%C3%A9lorusse','Bielaroussy','Bielarouss','Bi%C3%A9larussie','Bi%C3%A9larusse'))},
'pt-PT_pt'=>{ned=>'tpt-PT_pt',q=>join($or_,('Bielo-R%C3%BAssia','Bielor%C3%BAssia','Bielorr%C3%BAssia'))}
);

my $ya=join('%7C','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%E0%F0%F3%F1%FC','%C1%E5%EB%EE%F0%F3%F1%F1%EA%E8%E9');

my @lang=('en','es','fr','de','nl_nl','it','pt-PT_pt','pt-BR_br');
my %lang_web=('en'=>2,'ru'=>1,'fr'=>2,'de'=>2,'nl_nl'=>2,'it'=>2,'pt-PT_pt'=>2,'pt-BR_br'=>2);

my $results=0;

sub fn{
substr($_[1],-3) eq '.gz'?($_[0]?"|gzip -9 >$_[1]":"gzip -dc <$_[1]|"):($_[0]?">$_[1]":"<$_[1]");
}

sub add_cont{
my $l=$_[0].'/';
my $x;
$l=~s/http\:\/\/(.*?)[\:\/]/$x=$1;''/gsei;
$cont{$x}||=$_[1];
}

sub cur_cont{
my ($i,$r);
for(sort keys %cont){
 $i=unesc($_);
 $r.="<li>".url("http://$i",$cont{$_}) if(index($_[0],$_)>=0 || index($_[0],$i)>=0 || $_ ne $cont{$_});
}
$r
}

sub esc{
my $x=shift;
$x=~s/([\x00-\x1f,:\"\'\\\/])/sprintf('%%%02X',ord($1))/eg;
$x;
}

sub unesc{
my $x=shift;
local $1;
$x=~s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
$x;
}

sub qm{
#quotemeta($_[0])
my $x=shift;
$x=~s/([\'\\])/\\$1/gs;
$x=~s/\r/\\r/gs;
$x=~s/\n/\\n/gs;
$x;
}

sub get_xml{
my ($s,%h,@a,@a1,@ad);
@a=split(/:\/\//,$_[0],2);
unshift @a,'http' if(!defined($a[1]));
@a[1,3]=split(/\//,$a[1],2);
@ad=@a[1,2]=split(/:/,$a[1],2);
@a1=@a;
$ad[1]||=80;
$a1[0]&&="$a[0]://";
$a1[2]&&=":$a[2]";
$a1[3]&&="/$a[3]";
if($proxy){
 @ad=split(/:/,$proxy,2);
 $a1[3]=join('',@a1);
}
print "+";
socket(SO,PF_INET,SOCK_STREAM,PROTO_TCP)&&
connect(SO,sockaddr_in($ad[1],inet_aton($ad[0])))&&goto OK;
close(SO);
$fail++;
return;
OK:
select(SO);$|=1;select(STDOUT);
print SO qq($_[1] $a1[3] HTTP/1.1
Host: $a1[1]
User-Agent: robot $url
Accept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5
Accept-Language: ru,be;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding: none
$_[2]Connection: close

);
while((!eof(SO))&&defined(my $x=<SO>)){$s.=$x;$x=~s/[\r\n]*//gs;$x||last}
$s=~s/(.*?): (.*?)[\r]\n/$h{lc($1)}=$2/gise;
undef $s;
while((!eof(SO))&&defined(my $x=<SO>)){$s.=$x}
close(SO);
my $s1;
$s=~s/.*(<\?xml.*?\/rss>).*/$s1=$1;''/se;
$s1,%h
}

sub url{
my $u=$_[0];
my $t=$_[1]||unesc($u);
"<a href=\"$u\">$t</a>"
}

my (@block,%item,%channel,@items,$cnt0,$cnt0_);

## 'id'=>[start,char,end,start1,char1.end1];
my %blocks=(
'rss.channel'=>[
sub{%channel=()},
undef,
undef,
undef,
sub{shift;$channel{$block[$#block]}=join('',@_)}
],
'rss.channel.item'=>[
sub{%item=()},
undef,
sub{push @items,{%item};undef %item},
undef,
sub{shift;$item{$block[$#block]}=join('',@_)},
undef
],
'rss.channel.item.link'=>[
]
);

#$blocks{'rdf:RDF.item'}=$blocks{'rss.channel.item'};

my %handlers=(
Start=>sub{parser_event(3,@_);push @block,$_[1];unshift @_,0;goto &parser_event},
Char=>sub{parser_event(1,@_);unshift @_,4;goto &parser_event},
End=>sub{parser_event(2,@_);while($_[1] ne pop @block){};unshift @_,5;goto &parser_event},
);


sub parser_event{
my $e=shift;
my $id=join('.',@block[0..$#block-($e>3)]);
#print "$id\n";
if(exists($blocks{$id})){
 my $h=$blocks{$id};
 goto ref($h)||return;
 HASH:return $h->{('Start','Char','End')[$e]}(@_);
 ARRAY:return defined(@$h[$e])?&{@$h[$e]}(@_):undef;
 SCALAR:return;
}
}


sub get_rss{
my $url=shift;
my $retry=2;
RETRY:
my ($x,%h,$ff,$ffb,$t,$p,$lm);
print "get $url\n";
if(substr($url,0,7) eq 'file://'){
$ff=substr($url,7);
return if(!-e $ff);
goto FILE;
}
add_cont($url);
($x,%h)=get_xml($url,'HEAD');
return if(!defined(%h));
if(exists($h{'last-modified'})){
$ffb="$temp/newz-".esc($url);
stat($ff="$ffb.".esc($h{'last-modified'}));
if(-e _){
FILE:
 print "== $ff\n";
 open(FF,fn(0,$ff)) or die "$! $ff";
 $x='';
 while(!eof(FF)&&(my $s=<FF>)){$x.=$s}
 close(FF);
}else{
 ($x,%h)=get_xml($url,'GET');
 return if(!defined($x));
}
}else{($x,%h)=get_xml($url,'GET');
}
$p=new XML::Parser::Lite;
$p->setHandlers(%handlers);
my ($enc,@a);
$x=~s/\<\?xml(.*?)\>/push @a,$1;"\<\?xml$1\>"/ges;
for(@a){$_=~s/ encoding\=\"(.*?)\"/$enc=lc($1);" encoding=\"$encoding\""/ges}
from_to($x,$enc,$encoding,HTMLCREF) if(defined($enc) && $enc ne $encoding);
undef %channel;
@items=();
if(index($x,'/rss>')<0){print "\nERROR: $url\n";$retry--?goto RETRY:return}
$p->parse($x);

## debug:
#$ff=substr(($ffb="$temp/newz-".esc($url)).".".esc($h{'last-modified'}||=gmtime),0,128);

if(exists($h{'last-modified'})){
 while(my $d=<$ffb.*>){unlink($d)}
 open(FF,fn(1,$ff)) or die $!;
 print FF $x;
 close(FF);
}
$x=$channel{title};
$x=~s/\:.*//s;
add_cont($channel{link},$x) if($channel{link});
addnews(@_);
}

my %htm=(
'lt'=>'<',
'gt'=>'>',
'amp'=>'&',
'quot'=>'"'
);

sub dehtml{
my $s=shift;
$s=~s/\&(.*?)\;/$htm{$1}||"\&$1;"/gse;
$s
}

sub htmlz{
my $s=shift;
for(keys %htm){$s=~s/$htm{$_}/$_/gs}
}

my @news0:shared;
my %news:shared;
my %nh:shared;
my %mm=('Jan'=>0,'Feb'=>1,'Mar'=>2,'Apr'=>3,'May'=>4,'Jun'=>5,'Jul'=>6,'Aug'=>7,'Sep'=>8,'Oct'=>9,'Nov'=>10,'Dec'=>11);

sub nkey{$_[0]->{xlink}||$_[0]->{link}||$_[0]->{description}}

# [param_redirect[,lang[,no_sort_by_time]]]
sub addnews{
my ($l,$x);
my $u=$_[0];
for(@items){
 $l=$_->{link};
 $_->{lang}||=$_[1];
 if($u){
  $x="$l\&";
  $x=~s/[\&\?\;]$u\=(.*?)\&/$l=dehtml($1);''/gse;
  if($x ne "$l\&"){
   $l="http://$l" if(index($l,'://')==-1);
   $_->{xlink}=$l;
  }
 };
 $l=nkey($_);
 if(exists($nh{$l})){$nh{$l}++}
 else{
  if($_[2]){
   $nh{$l}=0;
   push @news0,$_;
  }else{
   $nh{$l}=1;
   my ($t1,$t)=(0,$_->{pubDate});
   $t=~s/([0-9]{2})\:([0-9]{2})\:([0-9]{2})/$t1=$3+($2+$1*60)*60;''/e;
   $t=~s/([0-9]{1,2}) ([a-zA-Z]{3}) ([0-9]{4})/$t1+=($1+$mm{$2}*31+$3*365)*24*60*60;''/e;
   $t=~s/\+0([0-9])00/$t1-=$1*60*60;''/ex;
   $t=~s/\-0([0-9])00/$t1+=$1*60*60;''/ex;
   $t1++ while(exists($news{$t1}));
   $news{$t1}=$_
  }
  add_cont($l)
 }
}
}

sub mv{rename($_[0],$_[1])||`mv -f $_[0] $_[1]`}

my $time=gmtime;

sub time2h{
my $x="$_[0] GMT";
$x=~s/ /\&nbsp\;/gs;
$x
}

my ($t0,$time0,$tstamp,$counter,$rotate);



if(open(FT,"<$pub/time")){
 $t0=<FT>;chomp($t0);
 $time0=<FT>;chomp($time0);
 $counter=<FT>;chomp($counter);
 $tstamp=<FT>;chomp($tstamp);
 close FT;
 print "time: ",time-$t0,"\n";
};

$tstamp||=time;

sub cp{
my $x=quotemeta($_[0]);
my $y=quotemeta($_[1]);
`cp -f $x $y`
}

if($ARGV[0] eq 'test'){$web=$pub=$temp;}

get_rss("file://$pub/$xmlname.xml$enc",undef,undef,1);$cnt0_=$cnt0=$#news0;

if($ARGV[0] ne 'test' && (time-$t0>$rotate_time || $ARGV[0] eq 'rotate')){
 cp("$pub/$xmlname.xml$enc","$pub/$xmlname.$time.xml$enc");
 cp("$pub/$htmlname.html$enc","$pub/$htmlname.$time.html$enc");
 undef $t0;
 $rotate=1;
 $cnt0=-1;
}

if(!defined($t0)){($t0,$time0,$counter)=(time,$time,0)}

my $hhead=qq(<table width=100% height=0 border=0 cellpadding=0 cellspacing=0><tr><td align=left valign=top><b><font size=+2 face=times>Open&nbsp;Source&nbsp;News&nbsp;</font></b><sup>$subject</sup></td><td align=right valign=top>).
'<font size=-1>~~'.($rotate_time/3600).'h: '.time2h($time0)." - ".time2h($time)."<br>".
url("/dir/")."&nbsp;".url("/dir/".esc("$htmlname.$time0.html$enc"),"$time0&nbsp;GMT")."</font></td></tr></table>";

my $fh=qq(<html><head>$heads
</head><body>$hhead<ul>);

#get_rss('http://newsrss.bbc.co.uk/rss/russian/institutional/pda/rss.xml');

if($ARGV[0] eq 'test'){get_rss("file://$temp/fin.xml",undef,'en');goto noget;}

goto noget if($ARGV[0] eq 'html');

for(@lang){
for(my $st=0;$st<=$results;$st+=10){
# my $x='http://news.google.com/news?hl=en&ned=tus&ie=UTF-8&scoring=d&q=Belarus+OR+Belorussia+OR+Belarussian&output=rss';
 my $x='http://news.google.com/news?ie=UTF-8&scoring=d&output=rss';
 my $ll=$_;
 if(!exists($goo{$_})){
  $ll='';
  $x.="\&ned=t$_";
 }
 $x.="\&start=$st" if($st);
 for my $k (keys %{$goo{$ll}}){$x.="\&$k=$goo{$ll}->{$k}"}
 get_rss($x,'url',$_);
 last if($#items<0);
}
}
#get_rss('http://www.afn.by/news/rss/',undef,'ru');
#get_rss('http://www.euronews.net/rss/euronews_ru.xml',undef,'ru');

get_rss("file://$temp/fin.xml",undef,'en');
get_rss('http://news.tut.by/rss/all.rss',undef,'ru');
get_rss('http://www.charter97.org/export/index.xml',undef,'ru');
get_rss('http://news.yandex.ru/Belarus/index.rss','cl4url','ru');
#get_rss('http://blogs.yandex.ru/search.rss?how=tm&rd=2&text='.$ya,undef,'ru');
get_rss('http://blogs.yandex.ru/search.rss?how=tm&rd=2&text='.$ya.'&searchtarget_blogs=on',undef,'ru');
#get_rss('http://www.blogdigger.com/search?q=Belarus&sortby=date&type=rss',undef,undef);
#get_rss('http://www.belta.by/by/belta.rss',undef,'by');
get_rss('http://www.belta.by/ru/belta.rss',undef,'ru');

die "$fail failures!!" if($fail>2);

noget:
alarm(0);

my $cont0;
for(sort keys %cont){
 $i=unesc($_);
 $cont0.="<li>".url("http://$i",$cont{$_}) if($cont{$_} && $_ ne $cont{$_});
}

if(defined(&threads::list)){my @l=threads->list;for(@l){$_->join}}

for(sort keys %news){unshift @news0,delete($news{$_})}

open(FF,fn(1,"$pub/$xmlname.tmp.xml$enc")) or die "$! ".fn(1,"$pub/$xmlname.tmp.xml$enc");

print FF qq(<?xml version="1.0" encoding="$encoding"?>
<rss version="2.0">
<channel>
<title>$title</title>
<link>$url</link>
<description>Open Source News</description>);

my @fdelta=();

my $cnt=$#news0;
print "CNT: $cnt, $cnt0\n";
for(@news0){
 $cnt--;
 if((!$rotate)||$nh{nkey($_)}){
  my ($t,$s,$x,$d)=(dehtml($_->{title}),dehtml($_->{description}),dehtml($_->{xlink}));
  if(substr($_->{link},0,23) eq 'http://news.google.com/'){
   $s=~s/<br><table.*?><tr>.*?<font color.*?>(.*?)<\/font>.*?<\/font><br><font size\=\-1>(.*?)<\/font>.*?<\/table>.*/for(my $i=length($1);$i>=0;$i--){if(substr($t,-$i) eq substr($1,0,$i)){substr($t,-$i)='';last}};$d=$1;"<i>$1<\/i> $2"/gse;
   $d=~s/\&nbsp\;\-//gs;
   $x&&add_cont($x,dehtml($d));
  }
 if($lang_web{$_->{lang}}||!exists($_->{lang})){
  $s=($lang_web{$_->{lang}}==2?"[$_->{lang}] ":'').' '.'<b>'.url($_->{'link'},$t)."</b> $s";
  $fh.="<li>".(exists($_->{'xlink'})?url(unesc($x),'<i>link</i>').' ':'').$s;
  push @fdelta,qm($s) if($cnt>=$cnt0);
 }
 print FF "\n<item>";
 for my $tag(keys %{$_}){print FF "<$tag>$_->{$tag}</$tag>"}
 print FF "</item>";
 }
 if($cnt==$cnt0_){
  $tstamp=time;
  $fh.='<hr>';
  @fdelta[$#fdelta].='<hr>' if($cnt>$cnt0);
 }
}
print FF "\n</channel></rss>";
close FF;

my $cont_="<li>Contributors:<ul>".cur_cont($fh)."</ul>";

$fh.="\n<hr>\n$cont_\n</ul>$ad1$ad2</body></html>";

open(FH,fn(1,"$pub/$htmlname.tmp.html$enc")) or die "$! $pub/$htmlname.tmp.html";
print FH $fh;
close(FH);

if($#fdelta>=0){
 $counter++;
 open FH,fn(1,"$web/$deltaname$counter.js$enc");
 print FH "d([";
 my $c='';
 for(@fdelta){print FH "$c'".$_."'";$c=','};
 print FH "])";
 close(FH);
 for(my $i=$counter+1;unlink "$web/$deltaname$i.js$enc";$i++){}

 my $fj;
 for(1..$counter){$fj="<script src=\"newz$_.js\"></script>$fj"}
$fj=qq(<html><head>$heads<meta http-equiv="Content-Script-Type" content="text/javascript"><meta http-equiv="Refresh" content="300">

<noscript><meta http-equiv="Refresh" content="0; URL=/dir/newz.html"></noscript>
<script>tstamp='$tstamp';
l='$hhead';
c='$cont0'
</script>
<script src="newz.js"></script>
$fj
$ad1
</head>
<body onFocus=bodyFocus(1) onBlur=bodyFocus(0)><script>n()</script>.</body></html>);
# <table width=100% height=93 border=0 cellpadding=0 cellspacing=0><tr><td>$ad2</td></tr></table><br>.
open(FH,fn(1,"$web/index-j.html$enc"));
print FH $fj;
close(FH);
}


close(FH);

mv("$pub/$htmlname.tmp.html$enc","$pub/$htmlname.html$enc");
mv("$pub/$xmlname.tmp.xml$enc","$pub/$xmlname.xml$enc");

#if($rotate){
 unlink("$pub/time");
 open(FT,">$pub/time");
 print FT join("\n",$t0,$time0,$counter,$tstamp);
 close FT;
#}

__END__
License: Anarchy.

Все стихийные (включая социальные (включая юридические, моральные и т.д.))
аспекты существования и использования данного кода являются форс-мажорными
обстоятельствами и автора не интересуют.

(c) mahatma, 29.09.2006
