#!/usr/bin/perl -w

######################################################################
# 
# Program: fetch_unanswered.pl
#
#       Retrieve articles from to which no reply has yet been posted.  
#	Assumes that arguments passed to program are newsgroup names.	
#       Articles are all printed to the standard output.
#
#       options:
#               -j Turn off threading of articles by subject.
#               -m <limit>              Look back at most <limit> headers/nov.
#               -n <limit>              Fetch at most <limit> NOV records
#                                       with one request to server.
#               -s <news server name>   override default news server 
#
# Current version of this program will be accessible from.
#       http://www.software-path.com/scripts.html
#
# Please send any comments to: RonaldWS@software-path.com
#
# A version with a reply feature exists.  The reply feature is not included
# here since it requires about 200 lines of additional unrelated code and 
# belongs in a separate script. CPAN script submission currently requires 
# that "It must be a single file ...". Contact the author if interested in 
# the reply feature.
#
######################################################################

use strict;
use News::NNTPClient;
use Getopt::Std;

my $VERSION = '0.17';

use vars qw($opt_m $opt_j $opt_n $opt_s $VERSION);

# server will be set to (in order of decreasing priority)
#       -s command line parameter
#       NNTPSERVER environment variable
#       /etc/nntpserver
#       default set here
my $server;
my $default_server = '"set default_server or use -s parameter"';

my $xover_batch_size = 500;

######################################################################
# "Nice to have" enhancements:
#       support for newnews
#       time estimation
#       FAQ filtering option/kill file.
######################################################################

my $news_client;
my %unanswered;
my %record_dup_subj;

######################################################################
# Print a status message to STDERR.  If caller does not provide
# line termination then terminate line with time stamp and LF("\n").
######################################################################
sub post_console_message {
        print STDERR @_;
        print STDERR " (", scalar(localtime()), ")\n" unless (
                $_[$#_] =~ /\n/         # Last parm has LF.
        );
}

######################################################################
# Here we remove messages with subjects that look like replies and
# begin to track groups of messages with the same subject.
# User may request no filter by subject.
######################################################################
sub FilterSubject {
        my $msg_id = shift;
        my $subj = lc(shift);
        my $has_ref = shift;

        $subj =~ s/^\s*//;
        $subj =~ s/\s*$//;

        # if subject filtering remove msgs with subject that looks like reply
        delete $unanswered{$msg_id} if (
                ($subj =~ s/^re(\:?)\s+//) && (! $has_ref)
        );

        # List of message id's by subject.  Advanced technique - sorry!
        push @{$record_dup_subj{$subj}}, $msg_id;
}

######################################################################
# Look through duplicate subject hash for cases where multiple messages
# had the same subject and remove their message id's from the unanswered
# list.
######################################################################
sub RemoveDuplicateSubject {
        foreach my $msg_id_lh (values %record_dup_subj) {
                if (scalar(@$msg_id_lh) > 1) {
                        foreach my $dup_msg_id (@$msg_id_lh) {
                                delete $unanswered{$dup_msg_id};
                        }
                }
        }
}

######################################################################
# Use NNTP XOVER request to fetch header information needed to
# determine which articles have not yet received a response.
# This is one of the more efficient approaches.
######################################################################
sub SetUnansweredXover {
        my ($news_client, $first_num, $last_num, $batch_size) = @_;

        my ($batch_first, $batch_last);
        my $overview_fmt;
        my ($i, %overview_fields, $id_field, $ref_field, $subject_field);
        my @all_ref;

        $overview_fmt = $news_client->list('overview.fmt');
        die $news_client->message() unless ($news_client->ok());
        
        %overview_fields = map((uc($_), $i++), 
                grep(s/\s*$//, @$overview_fmt));
        $id_field = $overview_fields{'MESSAGE-ID:'};
        $ref_field = $overview_fields{'REFERENCES:'};
        $subject_field = $overview_fields{'SUBJECT:'};

        for (   $batch_first = $first_num,
                $batch_last = $first_num + $batch_size -1;
                $batch_first < $last_num;
                $batch_first = $batch_last + 1,
                $batch_last = $batch_first + $batch_size -1
        ) {
                $batch_last = $last_num if ($batch_last > $last_num);
                foreach my $xover_line 
                        ($news_client->xover("${batch_first}-${batch_last}")) {
                        my ($msg_num, $msg_id, $ref, $subject) =
                                (split /\t/, $xover_line)
                                 [0, $id_field +1, 
                                        $ref_field +1, $subject_field +1];
                        my $has_ref = (defined($ref) && $ref);
                        if ($has_ref) {
                                foreach my $ref_id (split(' ', $ref)) {
                                        delete $unanswered{$ref_id};
                                }
                        }
                        else {
                                $unanswered{$msg_id} = $msg_num;
                        }
                        FilterSubject($msg_id, $subject, $has_ref) 
                                unless ($opt_j);
                }
                post_console_message 'Processed requests for ', 
                        $batch_last - $first_num +1,
                        " NOV records of ", $last_num - $first_num +1, '.';
        }
}

######################################################################
# Fetch each article header, one at a time, to determine which
# articles have not yet received any response.
# This is a very inefficient approach but does not require any
# NNTP extension services.
######################################################################
sub SetUnansweredHead {
        my ($news_client, $first_num, $last_num) = @_;

        my ($article_num, $err_count);
        my $i = 0;
        
        for (   $article_num = $first_num; 
                $article_num <= $last_num;
                $article_num++) {
                my $head;
                my ($msg_id, $ref_id);

                post_console_message("counting heads: $i") if ((++$i % 100)==0);       
                $head = $news_client->head($article_num);
                unless ($news_client->ok()) {
                        $err_count++ if (
                                $news_client->message() !~ 
                                        /bad article number/i
                        );
                        next;
                }

                ($msg_id) = grep(/Message\-ID\:/i, @$head);
                ($msg_id) = ($msg_id =~ /Message\-ID\: (\<.*?\>)/i);
                ($ref_id) = grep(/References\:/i, @$head);
                if (defined $ref_id) {
                        ($ref_id) = ($ref_id =~ /References\: (\<.*?\>)/i);
                        delete $unanswered{$ref_id};
                }
                else {
                        $unanswered{$msg_id} = $article_num;
                }
                unless ($opt_j) {
                        my ($subject) = grep(/Subject\:/i, @$head);
                        ($subject) = ($subject =~ /Subject: (.*)/i);
                        FilterSubject($msg_id, $subject, defined($ref_id));
                }
        }
        post_console_message("counting heads: $i") unless (($i % 100)==0);
        post_console_message("*Warning* errors: $err_count.") if ($err_count);
}

######################################################################
# Here we expend too much effort to be platform independent.
# We really should `cat ...`
######################################################################
sub read_etc_nntpserver {
        my $rc;

        open(FH, '</etc/nntpserver') || return undef;
        $rc = scalar(<FH>);
        close(FH);
        $rc =~ s/\s*$//;
        return $rc || undef;
}

######################################################################
# Fetch unanswered articles for one news group.
######################################################################
sub fetch_group_unanswered {
	my $group = shift;

	# get news article number range
	my ($first_num, $last_num) = $news_client->group($group);
	die $news_client->message() unless ($news_client->ok());
	$first_num = $last_num - $opt_m +1 if ($opt_m && ($opt_m =~ /^\d+$/));

	# Test scaffolding.  Under Linux this forces overview analysis to fail.
# 	$news_client->quit();
# 	$news_client = new News::NNTPClient($server);
#	$news_client->debug(0);

	post_console_message('Finding unanswered articles.');

	%unanswered = ();
	%record_dup_subj = ();
	
	######################################################################
	# The actual work of deciding which articles for the group are
	# unanswered is done here.
	######################################################################
	eval {
        	SetUnansweredXover(
                	$news_client, $first_num, $last_num, $xover_batch_size
        	);
	};
	if ($@) {
        	post_console_message 'Xover failed; trying one message at a ',
        		'time.  This may take a while.', "\n";

	# more test scaffolding
#	       $news_client->mode_reader(); 
#	       $news_client->group($group);

		SetUnansweredHead($news_client, $first_num, $last_num);
	}

	unless ($opt_j) {
        	RemoveDuplicateSubject();
        	%record_dup_subj = ();  # free what may be substantial memory
	}

	######################################################################
	# End of "find unanswered" code block.
	######################################################################
	post_console_message('Done finding unanswered articles.');
	post_console_message('Fetching ', scalar(keys %unanswered),
        	' unanswered articles.');

	# Fetch each unanswered article from the news server
	# and print it to the standard output.
	foreach my $article_id (
        	sort {$unanswered{$b} <=> $unanswered{$a}} keys(%unanswered)
        ) {
        	my $msg = $news_client->article($article_id);
        	print @$msg;
	}
	
}

######################################################################
# Start of program.
######################################################################

# process command line options
getopts("jm:n:rs:");
unless (@ARGV) {
	print <<EOT;
Usage: fetch_unanswered.pl [options] newsgroup [newsgroup ...]
	see perldoc fetch_unanswered.pl for options
	(use fetch_unaswered-${VERSION}.pl where appropriate)
EOT
	exit(0);
}
$server = $opt_s if (defined($opt_s));

$xover_batch_size = $opt_n if (defined($opt_n));

$server = $ENV{'NNTPSERVER'} if (
        (! defined($server))            &&
        $ENV{'NNTPSERVER'}
);
$server = read_etc_nntpserver() if (
        (! defined($server))            &&
        (-r '/etc/nntpserver')
);
$server = $default_server unless(defined $server);

# connect to news server
$news_client = new News::NNTPClient($server);
unless ($news_client->ok()) {
        $news_client->quit();
        die $news_client->message();
}

$news_client->debug(0);
$news_client->mode_reader();

foreach my $news_group (@ARGV) {
	eval{fetch_group_unanswered($news_group);};
	print STDERR $@ if($@);
}

post_console_message('Done.');

$news_client->quit();


=head1 NAME

fetch_unanswered.pl - Retrieve news articles that do not have a reply. 

=head1 DESCRIPTION


Retrieve articles from one newsgroup to which no reply has yet been posted.  
Newsgroup names are passed as command line arguments to the program.
Articles are all printed to the standard output and status messages are 
printed to STDERR.

=head1 COMMAND LINE OPTIONS

=over 4

=item -j

Turn off threading of articles by subject.  Turning this off also
saves (some) time and memory.  Article threading eliminates
articles starting with 'Re:' and groups of articles with the
same subject.

=item -m <Max headers to look back.>

Look back at most -m headers/nov records.

=item -n <NOV record batch size>

Limit number of NOV records we read from server with one
request.  A small number will result in more frequent
feedback to the user.   

=item  -s <news server name>

Override default news server.

        Default is: (in order of decreasing priority)
        value of NNTPSERVER environment variable
        value from /etc/nntpserver file
        value set at start of fetch_unanswered.pl source code.

=back 4

=head1 PREREQUISITES

This script requires the C<strict> module.  It also requires
C<Getopt::Std> and C<News::NNTPClient>.

=pod OSNAMES

any

=pod SCRIPT CATEGORIES

Networking

=cut
