2
# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
3
# This file is public domain and comes with NO WARRANTY of any kind
5
# This program is brought to you by Janne-Petteri Koilo with the
6
# administration of Michael Widenius.
8
# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith
11
# This program takes your mails and puts them into your database. It ignores
12
# messages with the same from, date and message text.
13
# You can use mail-files that are compressed or gzipped and ends with
27
$opt_socket = undef();
30
$opt_password = undef();
31
$opt_max_mail_size = 65536;
35
$opt_stop_on_error = 0;
38
my ($dbh, $progname, $mail_no_from_f, $mail_no_txt_f, $mail_too_big,
39
$mail_forwarded, $mail_duplicates, $mail_no_subject_f, $mail_inserted);
41
$mail_no_from_f = $mail_no_txt_f = $mail_too_big = $mail_forwarded =
42
$mail_duplicates = $mail_no_subject_f = $mail_inserted = 0;
46
# Remove the following message-ends from message
49
"\n-*\nSend a mail to .*\n.*\n.*\$",
50
"\n-*\nPlease check .*\n.*\n\nTo unsubscribe, .*\n.*\n.*\nIf you have a broken.*\n.*\n.*\$",
51
"\n-*\nPlease check .*\n(.*\n){1,3}\nTo unsubscribe.*\n.*\n.*\$",
52
"\n-*\nPlease check .*\n.*\n\nTo unsubscribe.*\n.*\$",
53
"\n-*\nTo request this thread.*\nTo unsubscribe.*\n.*\.*\n.*\$",
54
"\n -*\n.*Send a mail to.*\n.*\n.*unsubscribe.*\$",
55
"\n-*\nTo request this thread.*\n\nTo unsubscribe.*\n.*\$"
58
# Generate regexp to remove tails where the unsubscribed is quoted
62
foreach $tail (@remove_tail)
64
$tail =~ s/\n/\n[> ]*/g;
67
push @remove_tail,@tmp;
70
my %months = ('Jan' => 1, 'Feb' => 2, 'Mar' => 3, 'Apr' => 4, 'May' => 5,
71
'Jun' => 6, 'Jul' => 7, 'Aug' => 8, 'Sep' => 9, 'Oct' => 10,
72
'Nov' => 11, 'Dec' => 12);
75
$progname =~ s/.*[\/]//;
85
my ($connect_arg, @args, $ignored, @defops, $i);
87
if (defined(my_which("my_print_defaults")))
89
@defops = `my_print_defaults mail_to_db`;
91
splice @ARGV, 0, 0, @defops;
95
print "WARNING: No command 'my_print_defaults' found; unable to read\n";
96
print "the my.cnf file. This command is available from the latest MySQL\n";
97
print "distribution.\n";
99
GetOptions("help","version","host=s","port=i","socket=s","db=s",
100
"user=s","password=s","max_mail_size=i","create","test",
101
"no_path","debug","stop_on_error","stdin")
102
|| die "Wrong option! See $progname --help\n";
104
usage($VER) if ($opt_help || $opt_version ||
105
(!$ARGV[0] && !$opt_create && !$opt_stdin));
107
# Check that the given inbox files exist and are regular files
108
for ($i = 0; ! $opt_stdin && defined($ARGV[$i]); $i++)
110
die "FATAL: Can't find inbox file: $ARGV[$i]\n" if (! -f $ARGV[$i]);
113
$connect_arg = "DBI:mysql:";
114
push @args, "database=$opt_db" if defined($opt_db);
115
push @args, "host=$opt_host" if defined($opt_host);
116
push @args, "port=$opt_port" if defined($opt_port);
117
push @args, "mysql_socket=$opt_socket" if defined($opt_socket);
118
push @args, "mysql_read_default_group=mail_to_db";
119
$connect_arg .= join ';', @args;
120
$dbh = DBI->connect("$connect_arg", $opt_user, $opt_password,
122
|| die "Couldn't connect: $DBI::errstr\n";
124
die "You must specify the database; use --db=" if (!defined($opt_db));
126
create_table($dbh) if ($opt_create);
131
process_mail_file($dbh, "READ-FROM-STDIN");
137
# Check if the file is compressed
138
if (/^(.*)\.(gz|Z)$/)
140
open(FILE, "zcat $_ |");
141
process_mail_file($dbh, $1);
146
process_mail_file($dbh, $_);
150
$dbh->disconnect if (!$opt_test);
152
$ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f +
153
$mail_too_big + $mail_duplicates + $mail_fixed);
154
print "################################ Mail Report #################################\n\n";
155
print "Mails inserted:\t\t\t\t\t$mail_inserted\n";
156
print "--------------- ";
157
print "=" . "=" x length("$mail_inserted") . "=\n\n";
160
print "Ignored mails\n";
161
print "-------------\n";
164
print "Reason: mail without \"From:\" -field:\t\t$mail_no_from_f\n";
172
print "Reason: mail without message:\t\t\t$mail_no_txt_f\n";
178
if ($mail_no_subject_f)
180
print "Reason: mail without subject:\t\t\t$mail_no_subject_f\n";
188
print "Reason: mail too big, over $opt_max_mail_size bytes:\t\t";
190
print " (see --max_mail_size=#)\n";
196
if ($mail_duplicates)
198
print "Reason: duplicate mail, or in db already:\t$mail_duplicates\n";
206
print "Reason: mail was an unsubscribe - mail:\t\t$mail_fixed\n";
213
print "=" . "=" x length("$ignored") . "=\n";
214
print "Total number of ignored mails:\t\t\t$ignored\n\n";
216
print "Total number of mails:\t\t\t\t";
217
print $mail_inserted + $ignored;
219
print sprintf("%.1f", ($mail_inserted + $ignored) ? (($mail_inserted / ($mail_inserted+$ignored)) * 100) : 0.0);
221
print sprintf("%.1f", ($mail_inserted + $ignored) ? (($ignored / ($mail_inserted + $ignored)) * 100) : 0);
223
print "################################ End Report ##################################\n";
239
mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment,
240
message_id VARCHAR(255),
241
in_reply_to VARCHAR(255),
242
date DATETIME NOT NULL,
243
time_zone VARCHAR(20),
244
mail_from VARCHAR(120) NOT NULL,
249
txt MEDIUMTEXT NOT NULL,
250
file VARCHAR(64) NOT NULL,
251
hash INTEGER NOT NULL,
255
PRIMARY KEY (mail_from, date, hash))
256
ENGINE=MyISAM COMMENT=''
258
$sth = $dbh->prepare($query) or die $DBI::errstr;
259
$sth->execute() or die "Couldn't create table: $DBI::errstr\n";
263
#### inbox processing. Can be either a real file, or standard input.
266
sub process_mail_file
268
my ($dbh, $file_name) = @_;
269
my (%values, $type, $check);
271
$file_name =~ s/.*[\/]// if ($opt_no_path);
279
chop if (substr($_, -1, 1) eq "\r");
280
if ($type ne "message")
282
if (/^Reply-To:\s*(.*)/i)
287
elsif (/^From: (.*)/i)
302
elsif (/^Subject: (.*)/i)
307
elsif (/^Message-Id:\s*(.*)/i)
309
$type = "message_id";
313
elsif (/^In-Reply-To:\s*(.*)/i)
315
$type = "in_reply_to";
319
elsif (/^Date: (.*)/i)
321
date_parser($1, \%values, $file_name);
324
# Catch those fields that we don't or can't handle (yet)
337
if ($type eq 'message_id' || $type eq 'in_reply_to')
341
$values{$type} .= $_;
344
elsif ($check != 0 && $_ ne "") # in case of forwarded messages
346
$values{$type} .= "\n" . $_;
349
elsif (/^From .* \d\d:\d\d:\d\d\s\d\d\d\d/ ||
350
/^From .* \d\d\d\d\s\d\d:\d\d:\d\d/)
352
$values{'hash'} = checksum("$values{'message'}");
353
update_table($dbh, $file_name, \%values);
358
elsif (/-* forwarded message .*-*/i) # in case of forwarded messages
360
$values{$type} .= "\n" . $_;
366
$values{$type} .= "\n" . $_;
369
if (defined($values{'message'}))
371
$values{'hash'} = checksum("$values{'message'}");
372
update_table($dbh, $file_name, \%values);
377
#### get date and timezone
382
my ($date_raw, $values, $file_name, $tmp) = @_;
384
# If you ever need to change this test, be especially careful with
385
# the timezone; it may be just a number (-0600), or just a name (EET), or
386
# both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT.
387
# You probably should use a 'greedy' regexp in the end
388
$date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/;
390
if (!defined($1) || !defined($2) || !defined($3) || !defined($4) ||
391
!defined($months{$2}))
393
if ($opt_debug || $opt_stop_on_error)
395
print "FAILED: date_parser: 1: $1 2: $2 3: $3 4: $4 5: $5\n";
396
print "months{2}: $months{$2}\n";
397
print "date_raw: $date_raw\n";
398
print "Inbox filename: $file_name\n";
400
exit(1) if ($opt_stop_on_error);
401
$values->{'date'} = "";
402
$values->{'time_zone'} = "";
405
$tmp = $3 . "-" . $months{$2} . "-" . "$1 $4";
406
$tmp.= defined($5) ? $5 : ":00";
407
$values->{'date'} = $tmp;
408
print "INSERTING DATE: $tmp\n" if ($opt_debug);
409
$values->{'time_zone'} = $6;
418
my($dbh, $file_name, $values) = @_;
419
my($q, $tail, $message);
421
if (!defined($values->{'subject'}) || !defined($values->{'to'}))
423
$mail_no_subject_f++;
424
return; # Ignore these
426
$message = $values->{'message'};
427
$message =~ s/^\s*//; # removes whitespaces from the beginning
430
$message =~ s/[\s\n>]*$//; # removes whitespaces and '>' from the end
431
$values->{'message'} = $message;
432
foreach $tail (@remove_tail)
434
$message =~ s/$tail//;
436
if ($message ne $values->{'message'})
438
$message =~ s/\s*$//; # removes whitespaces from the end
440
goto restart; # Some mails may have duplicated messages
443
$q = "INSERT INTO my_mail (";
459
$q.= (defined($values->{'message_id'}) ?
460
$dbh->quote($values->{'message_id'}) : "NULL");
462
$q.= (defined($values->{'in_reply_to'}) ?
463
$dbh->quote($values->{'in_reply_to'}) : "NULL");
465
$q.= "'" . $values->{'date'} . "',";
466
$q.= (defined($values->{'time_zone'}) ?
467
$dbh->quote($values->{'time_zone'}) : "NULL");
469
$q.= defined($values->{'from'}) ? $dbh->quote($values->{'from'}) : "NULL";
471
$q.= defined($values->{'reply'}) ? $dbh->quote($values->{'reply'}) : "NULL";
473
$q.= defined($values->{'to'}) ? $dbh->quote($values->{'to'}) : "NULL";
475
$q.= defined($values->{'cc'}) ? $dbh->quote($values->{'cc'}) : "NULL";
477
$q.= $dbh->quote($values->{'subject'});
479
$q.= $dbh->quote($message);
481
$q.= $dbh->quote($file_name);
483
$q.= "'" . $values->{'hash'} . "'";
486
# Don't insert mails bigger than $opt_max_mail_size
487
if (length($message) > $opt_max_mail_size)
491
# Don't insert mails without 'From' field
492
elsif (!defined($values->{'from'}) || $values->{'from'} eq "")
501
# Don't insert mails without the 'message'
502
elsif ($message eq "")
510
# This should never happen. This means that the above q failed,
511
# but it wasn't because of a duplicate mail entry
512
elsif (!($DBI::errstr =~ /Duplicate entry /))
514
die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n";
519
print "Duplicate mail: query: $q\n" if ($opt_debug);
525
#### In case you have two identical messages we wanted to identify them
526
#### and remove additionals; We do this by calculating a hash number of the
527
#### message and ignoring messages with the same from, date and hash.
528
#### This function calculates a simple 32 bit hash value for the message.
534
my ($crc, $i, $count);
535
$count = length($txt);
536
for ($crc = $i = 0; $i < $count ; $i++)
538
$crc = (($crc << 1) + (ord (substr ($txt, $i, 1)))) +
539
(($crc & (1 << 30)) ? 1 : 0);
540
$crc &= ((1 << 31) -1);
546
#### my_which is used, because we can't assume that every system has the
547
#### which -command. my_which can take only one argument at a time.
548
#### Return values: requested system command with the first found path,
549
#### or undefined, if not found.
557
return $command if (-f $command && -x $command);
558
@paths = split(':', $ENV{'PATH'});
559
foreach $path (@paths)
561
$path = "." if ($path eq "");
562
$path .= "/$command";
563
return $path if (-f $path && -x $path);
569
#### usage and version
578
print "$progname version $VER\n";
583
$progname version $VER
585
Description: Insert mails from inbox file(s) into a table. This program
586
can read group [mail_to_db] from the my.cnf file. You may want to have db
587
and table set there at least.
589
Usage: $progname [options] file1 [file2 file3 ...]
590
or: $progname [options] --create [file1 file2...]
591
or: cat inbox | $progname [options] --stdin
593
The last example can be used to read mails from standard input and can
594
useful when inserting mails to database via a program 'on-the-fly'.
595
The filename will be 'READ-FROM-STDIN' in this case.
598
--help Show this help and exit.
599
--version Show the version number and exit.
600
--debug Print some extra information during the run.
601
--host=... Hostname to be used.
602
--port=# TCP/IP port to be used with connection.
603
--socket=... MySQL UNIX socket to be used with connection.
604
--db=... Database to be used.
605
--user=... Username for connecting.
606
--password=... Password for the user.
607
--stdin Read mails from stdin.
608
--max_mail_size=# Maximum size of a mail in bytes.
609
Beware of the downside letting this variable be too big;
610
you may easily end up inserting a lot of attached
611
binary files (like MS Word documents etc), which take
612
space, make the database slower and are not really
613
searchable anyway. (Default $opt_max_mail_size)
614
--create Create the mails table. This can be done with the first run.
615
--test Dry run. Print the queries and the result as it would be.
616
--no_path When inserting the file name, leave out any paths of
618
--stop_on_error Stop the run, if an unexpected, but not fatal error occurs
619
during the run. Without this option some fields may get
620
unwanted values. --debug will also report about these.