#!/usr/bin/perl
my $rcs='@(#)$Header: /home/jamesb/CVS/junkfilter/src/junkFilter,v 1.1.1.1 2004/07/11 16:53:07 jamesb Exp $';
#
# Module : junkFilter
# Purpose: Junk mail filter.
# Author : B.James
# Date   : $Date: 2004/07/11 16:53:07 $
# Version: $Revision: 1.1.1.1 $
#
# 11/07/2004: B.James: Updated to use SDBM_File instead of DB_File
#

=pod

=head1 NAME

junkFilter - Junk mail filter.

=head1 SYNOPSIS

junkFilter [ -s | -h | -p file | -j file | -g file ]

=head2 Options:-

=over

=item -s	Setup the data directory

=item -j	Processes the specified file as a junk file. Builds the junk tables.

=item -g	Processes the specified file as a good word file.

=item -p	Processes the specified file and says if it thinks it's junk or not.

=item -h	Help.

=back

With no arguments, processes stdin against the database, and tries to determine if it's junk.

Exit code ( $? ) is set as:-
0	Junk email
1	Good email. (or undecided about it).

=head1 DESCRIPTION

=head2 Overview

Designed to be used in conjunction with procmail.
Based on a Bayesian Heuristic, it builds a database of good and junk words, 
with associated probabilities and processes messages against the database 
to determine if they are junk mail or not.

=head1 PREREQUISITES

=over 

=item C<strict>

=item C<SDBM_File>

=item C<FileHandle>

=item C<Getopt::Std>

=back

=head1 INSTALLATION

=head2 Setting up

Copy the program to a location on your path, or in your home directory.

The program will automatically create the data directory using the -s switch.
The data files will be placed in the .junkFilter directory in the user's home
directory.


=head2 Configuration

To be effective, it will need to be taught good or junk emails. Basically it just takes a text file
as an argument to the -j or -g options and builds the database of words from that.

This program is designed to be used under procmail, which allows much flexibility in its usage.

Here are the procmail rules I use:-

=over

# Run the mail through the junkFilter
:0 Wibc: junk.lock
| $HOME/bin/junkFilter

# If it exits with 0 (junk) the mail is filed in the junk folder, otherwise drop through and send the mail to the inbox
:0 a:
$HOME/mail/junk

=back

This configuration is one I feel safest with, as it won't actually delete the emails it finds as junk.

=head1 OSNAMES

Unix or Unix-likes.

=head1 SCRIPT CATEGORIES

Mail

=head1 README

Junk mail filter.  Designed to be used in conjunction with procmail, but not necessarily
restricted to it.
Based on a Bayesian Heuristic (yawn), it builds a database of good and junk words, 
with associated probabilities and processes messages against the database 
to determine if they are junk mail or not.

=head1 LICENCE

Copyright (c) 2004, Bruce James

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl.

=head1 DISCLAIMER

Use at your own risk. This program is supplied as is, and it is up to you whether you choose to use it or
not. As the program is based on a heuristic, it cannot guarantee to give accurate results.
I cannot be held responsible for any data or email loss that may occur during the use or misconfiguration of this program.

=head1 AUTHOR

Bruce James (custard@cpan.org)

=cut

# All subs below belong to the junkFilter package; the script instantiates
# the class and runs it at the very bottom of the file.
package junkFilter;
# NOTE(review): @ISA and @EXPORT are assigned before `use strict` takes
# effect, which is why they compile without `our`. No visible line loads
# Exporter and @EXPORT is empty, so nothing is actually exported.
@ISA = qw( Exporter );
@EXPORT = qw();

use strict;
#use DB_File;
# SDBM_File backs the three tied word/status hashes opened in constructor().
use SDBM_File;
# FileHandle is used by processFile() to open the input file.
# NOTE(review): presumably it is also what brings the O_RDWR / O_CREAT
# constants used in constructor() into scope - confirm against the
# FileHandle documentation before removing it.
use FileHandle;
# Getopt::Std supplies getopts(), used by run().
use Getopt::Std;

# Lexical (file-scoped) version number, printed by the -h help text.
my $VERSION=0.2;  	# $Revision: 1.1.1.1 $ - 1


# new( @args )
# Class constructor: blesses an empty hash into the requested class and
# hands any remaining arguments to constructor(), whose return value is
# returned to the caller.
sub new {
	my $class = shift;
	my $self  = bless( {}, $class );
	return $self->constructor( @_ );
}


# constructor()
# Initialises a freshly blessed object: makes sure the data directory
# exists (via checkDirectory) and ties the three SDBM databases that hold
# the junk word counts, the good word counts and the running totals.
#
# The tied hashes are stored by reference in the object as:
#   junkDB   - <dir>/junkWords.db
#   goodDB   - <dir>/goodWords.db
#   statusDB - <dir>/junkStatus.db
#
# Returns the object. Dies if the data directory is unavailable or if any
# of the databases cannot be opened.
sub constructor {
	my $this=shift;

	# checkDirectory returns a false value on failure, so there is no
	# directory name worth interpolating into the message here.
	my $jfdir = $this->checkDirectory()
		or die( "Failed checking for the junkFilter data directory.\n" );

	my %dbFileFor = (
		junkDB   => 'junkWords.db',
		goodDB   => 'goodWords.db',
		statusDB => 'junkStatus.db',
	);

	foreach my $slot (sort keys %dbFileFor) {
		my %db;
		my $path = $jfdir."/".$dbFileFor{$slot};

		# tie returns false on failure. Without this check the object would
		# carry an ordinary in-memory hash and every update would be
		# silently thrown away when the program exits.
		tie( %db, 'SDBM_File', $path, O_RDWR | O_CREAT, 0755 )
			or die( "Failed to open database $path: $!\n" );

		$this->{$slot} = \%db;
	}

	return $this;
}


# emailGood & emailJunk getters & setters
# I know it's a bit long-winded, but it leaves
# the opportunity to swap databases in the future.

# getJunkCount( $word )
# Returns the count stored for $word in the junk word database, or undef
# when the word has no entry. Dies when no (or a false) word is given.
sub getJunkCount {
	my ($this, $word) = @_;
	die( "getJunkCount: no word!\n" ) unless $word;
	my $table = $this->{junkDB};
	return $table->{$word};
}
# setJunkCount( $word, $count )
# Stores $count for $word in the junk word database. A missing or false
# count is stored as 0. Dies when no (or a false) word is given.
# Returns nothing.
sub setJunkCount {
	my ($this, $word, $count) = @_;
	die( "setJunkCount: no word!\n" ) unless $word;
	my $table = $this->{junkDB};
	$table->{$word} = $count || 0;
	return;
}
# getGoodCount( $word )
# Returns the count stored for $word in the good word database, or undef
# when the word has no entry. Dies when no (or a false) word is given.
sub getGoodCount {
	my ($this, $word) = @_;
	die( "getGoodCount: no word!\n" ) unless $word;
	my $table = $this->{goodDB};
	return $table->{$word};
}
# setGoodCount( $word, $count )
# Stores $count for $word in the good word database. A missing or false
# count is stored as 0. Dies when no (or a false) word is given.
# Returns nothing.
sub setGoodCount {
	my ($this, $word, $count) = @_;
	die( "setGoodCount: no word!\n" ) unless $word;
	my $table = $this->{goodDB};
	$table->{$word} = $count || 0;
	return;
}


# emailStatus getters & setters

# getGoodTotalCount()
# Returns the running total stored under 'goodCount' in the status
# database, or 0 when nothing (or a false value) is stored.
sub getGoodTotalCount {
	my ($this) = @_;
	my $status = $this->{statusDB};
	my $stored = $status->{'goodCount'};
	return $stored ? $stored : 0;
}

# getJunkTotalCount()
# Returns the running total stored under 'junkCount' in the status
# database, or 0 when nothing (or a false value) is stored.
sub getJunkTotalCount {
	my ($this) = @_;
	my $status = $this->{statusDB};
	my $stored = $status->{'junkCount'};
	return $stored ? $stored : 0;
}

# setGoodTotalCount( $count )
# Stores $count under 'goodCount' in the status database. A missing or
# false count is stored as 0. Returns nothing.
sub setGoodTotalCount {
	my ($this, $count) = @_;
	$count ||= 0;
	my $status = $this->{statusDB};
	$status->{'goodCount'} = $count;
	return;
}

# setJunkTotalCount( $count )
# Stores $count under 'junkCount' in the status database. A missing or
# false count is stored as 0. Returns nothing.
sub setJunkTotalCount {
	my ($this, $count) = @_;
	$count ||= 0;
	my $status = $this->{statusDB};
	$status->{'junkCount'} = $count;
	return;
}


# checkDirectory()
# Makes sure the data directory $HOME/.junkFilter exists, creating it with
# mode 0700 when it does not, and reports progress on standard output.
#
# Returns the directory path when it exists, is a directory and is
# writable; returns undef otherwise. Dies when $HOME is not set.
sub checkDirectory {
	my $this=shift;

	my $home=$ENV{HOME} || die("Can't locate user home dir." );
	my $jfdir="$home/.junkFilter";

	print( "Checking $jfdir exists..." );
	my $present = -d $jfdir;	# -d is false for a missing path as well
	print( $present ? " it does..." : " it doesn't. Creating it..." );
	mkdir( $jfdir, 0700 ) unless $present;	# rwx------

	# Whatever happened above, only a writable directory is acceptable.
	unless (-d $jfdir && -w $jfdir) {
		print( " Failed.\n" );
		return undef;
	}

	print( " Ok.\n" );
	return $jfdir;
}

# processFile( $source )
# Tokenises a mail message into lower-cased words.
#
# $source may be:
#   - a file name, which is opened read-only (dies if it cannot be opened);
#   - an already open filehandle (reference or glob), which is read as is
#     and left open for the caller;
#   - omitted/empty, in which case standard input is read.
#
# Lines from one containing ": base64" up to (not including) the next line
# starting with "--" are skipped, so encoded attachments do not pollute the
# word list. Each HTML-like tag becomes the token "htmltags" and each run
# of digits becomes the token "consecutivenumbers".
#
# Returns a two element list: a reference to a hash of word => occurrences,
# and the total number of words seen (0 for empty input). The total counts
# real words only, so it always equals the sum of the hash values.
sub processFile {
	my $this=shift;
	my $fh=shift;
	my %words;
	my $ignore;
	my $wordCount=0;
	my $openedHere=0;

	if (ref($fh) || ref(\$fh) eq 'GLOB') {
		# Caller supplied an open handle: read from it directly.
	} elsif (defined($fh) && length($fh)) {
		my $name=$fh;
		# An explicit mode keeps characters such as '|' or '>' in the file
		# name from being interpreted as an open mode.
		$fh=FileHandle->new( $name, 'r' )
			or die( "processFile: can't open $name: $!\n" );
		$openedHere=1;
	} else {
		$fh=*STDIN;
	}

	while( defined( my $line = <$fh> ) ) {
		$ignore=1 if $line =~ /: base64/;
		$ignore=0 if $line =~ /^--/;
		next if $ignore;

		# Non-greedy by construction: one replacement per tag, so words
		# sitting between two tags on the same line are kept.
		$line =~ s/<[^>]+>/!htmltags!/g;
		$line =~ s/\d+/!consecutivenumbers!/g;

		foreach my $word (split( /\W+/, $line )) {
			next unless length( $word );	# leading empty field from split
			$words{ lc( $word ) }++;
			$wordCount++;
		}
	}

	close( $fh ) if $openedHere;

	return \%words, $wordCount;
}


# updateJunkData( $file )
# Teaches the filter that the contents of $file are junk: the file is
# tokenised with processFile, its word total is added to the running junk
# total, and each word's occurrences are added to that word's junk count.
# Progress is reported on standard output. Does nothing when no file is
# given.
sub updateJunkData {
	my ($this, $file) = @_;
	return unless $file;

	print( "Compiling junk words from $file...\n" );
	my ($tally, $seen) = $this->processFile( $file );

	# Fold this file's word total into the running junk total.
	my $runningTotal = $this->getJunkTotalCount();
	$this->setJunkTotalCount( $runningTotal + $seen );

	print( "Storing in database...\n" );
	# Merge the per-word counts into the junk word database.
	for my $term (keys %{$tally}) {
		my $stored = $this->getJunkCount( $term );
		$this->setJunkCount( $term, $stored + $tally->{$term} );
	}
	print( "Done ($seen words processed).\n" );
}

# updateGoodData( $file )
# Teaches the filter that the contents of $file are good mail: the file is
# tokenised with processFile, its word total is added to the running good
# total, and each word's occurrences are added to that word's good count.
# Progress is reported on standard output. Does nothing when no file is
# given.
sub updateGoodData {
	my ($this, $file) = @_;
	return unless $file;

	print( "Compiling good words from $file...\n" );
	my ($tally, $seen) = $this->processFile( $file );

	# Fold this file's word total into the running good total.
	my $runningTotal = $this->getGoodTotalCount();
	$this->setGoodTotalCount( $runningTotal + $seen );

	print( "Storing in database...\n" );
	# Merge the per-word counts into the good word database.
	for my $term (keys %{$tally}) {
		my $stored = $this->getGoodCount( $term );
		$this->setGoodCount( $term, $stored + $tally->{$term} );
	}
	print( "Done ($seen words processed).\n" );
}


# processMail( $file )
# Classifies a message. $file is passed straight to processFile to obtain
# the message's distinct words.
#
# For every distinct word the junk count and the good count are looked up;
# the good count is scaled by (total junk words / total good words) so both
# corpora carry equal weight. Words known to either database contribute
# their junk and good fractions to two running scores, which are then
# normalised so that they sum to 1.
#
# Prints the verdict on standard output and returns:
#   0 - junk (normalised junk score above 0.50)
#   1 - good, or undecided because no word of the message is known
sub processMail {
	my ($this, $file) = @_;

	my ($tally) = $this->processFile( $file );

	# A total of 0 is replaced by 1 so the ratio below never divides by zero.
	my $junkWordTotal = $this->getJunkTotalCount() || 1;
	my $goodWordTotal = $this->getGoodTotalCount() || 1;
	my $scale = $junkWordTotal / $goodWordTotal;

	my ($junkScore, $goodScore);
	for my $term (keys %{$tally}) {
		# Only the good count is scaled, to even out the two corpora.
		my $junkHits = $this->getJunkCount( $term );
		my $goodHits = $this->getGoodCount( $term ) * $scale;

		my $hits = $junkHits + $goodHits;
		next unless $hits;	# word unknown to both databases

		$junkScore += $junkHits / $hits;
		$goodScore += $goodHits / $hits;
	}

	my $scoreSum = $junkScore + $goodScore;
	unless ($scoreSum) {
		print( "Undecided...\n" );
		return 1;
	}

	if ($junkScore / $scoreSum > 0.50) {
		print( "It's junk...\n" );
		return 0;
	}

	print( "It's good...\n" );
	return 1;
}


# run()
# Command line driver. Options are parsed from the global @ARGV by
# getopts; any arguments passed to run itself are ignored.
#
#   -s        check/create the data directory
#   -j file   learn file as junk
#   -g file   learn file as good mail
#   -p file   classify file
#   -h        print the help text
#   -i        accepted but ignored
#   (none)    classify the message on standard input
#
# Returns the value the script should exit with: the result of processMail
# when classifying (0 junk, 1 good/undecided) and 1 for every other action.
# An invalid command line prints the help text and returns 1 instead of
# falling through to classifying standard input, which would otherwise
# block waiting for input on a terminal.
sub run {
	my $this=shift;
	my %args;

	unless (getopts( 'hsij:g:p:', \%args )) {
		# getopts has already reported the offending option on STDERR.
		_printUsage();
		return 1;
	}

	if ($args{s}) {
		# Checks the data directory & creates if necessary.
		$this->checkDirectory();
		return 1;
	}
	if ($args{j}) {
		# Read a junk mail file & update the junk word database
		$this->updateJunkData( $args{j} );
		return 1;
	}
	if ($args{g}) {
		# Read a good mail file & update the good word database
		$this->updateGoodData( $args{g} );
		return 1;
	}
	if ($args{p}) {
		# Process file and filter according to probability of being junk
		return $this->processMail( $args{p} );
	}
	if ($args{h}) {
		_printUsage();
		return 1;
	}

	# No options: classify the message arriving on standard input.
	return $this->processMail();
}

# _printUsage() - print the version banner and option summary on standard output.
sub _printUsage {
	print( "junkFilter version $VERSION\n" );
	print( "junkFilter -s			Create database dir if necessary.\n");
	print( "           -p file		Process file against database\n");
	print( "           -j file | -g file	Process junk & good files respectively\n");
	print( "           -h 			Help\n");
	return;
}

# Script entry point: build the filter object and exit with whatever run()
# returns (0 = junk, 1 = good/undecided or a maintenance action).
my $main = junkFilter->new();
exit( $main->run( @ARGV ) );