~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
#!/usr/bin/perl -w
2
# Copyright Abandoned 1998 TCX DataKonsult AB & Monty Program KB & Detron HB
3
# This file is public domain and comes with NO WARRANTY of any kind
4
#
5
# This program is brought to you by Janne-Petteri Koilo with the 
6
# administration of Michael Widenius.
7
#
8
# Rewritten with a lot of bug fixes by Jani Tolonen and Thimble Smith
9
# 15.12.2000
10
#
11
# This program takes your mails and puts them into your database. It ignores
12
# messages with the same from, date and message text.
13
# You can use mail-files that are compressed or gzipped and end with
14
# -.gz or -.Z.
15
16
use DBI;
17
use Getopt::Long;
18
19
# Autoflush the currently selected output handle (STDOUT).
$| = 1;
$VER = "3.0";

# Defaults for the command line options parsed by GetOptions() in main().
# Options without a default start out undefined.
($opt_host, $opt_port, $opt_socket, $opt_user, $opt_password) = ();
$opt_db            = "mail";
$opt_max_mail_size = 65536;
$opt_help = $opt_version = $opt_debug = $opt_create = $opt_test =
  $opt_no_path = $opt_stop_on_error = $opt_stdin = 0;

# File-scoped state shared by the subs below: the database handle, the
# program name and the per-run mail counters used by the final report.
my ($dbh, $progname);
my ($mail_no_from_f, $mail_no_txt_f, $mail_too_big, $mail_forwarded,
    $mail_duplicates, $mail_no_subject_f, $mail_inserted) = (0) x 7;
$mail_fixed = 0;
44
45
#
46
# Remove the following message-ends from message
47
#
48
@remove_tail= (
"\n-*\nSend a mail to .*\n.*\n.*\$",
"\n-*\nPlease check .*\n.*\n\nTo unsubscribe, .*\n.*\n.*\nIf you have a broken.*\n.*\n.*\$",
"\n-*\nPlease check .*\n(.*\n){1,3}\nTo unsubscribe.*\n.*\n.*\$",
"\n-*\nPlease check .*\n.*\n\nTo unsubscribe.*\n.*\$",
"\n-*\nTo request this thread.*\nTo unsubscribe.*\n.*\.*\n.*\$",
"\n -*\n.*Send a mail to.*\n.*\n.*unsubscribe.*\$",
"\n-*\nTo request this thread.*\n\nTo unsubscribe.*\n.*\$"
);

# For every pattern above, append a variant that also matches when the
# footer has been quoted in a reply, i.e. when each footer line is prefixed
# with '>' and/or spaces.
{
  my @quoted;
  foreach my $tail (@remove_tail)
  {
    # Substitute on a copy: the loop variable aliases the array element, so
    # changing it directly would rewrite the original pattern in place and
    # the push below would merely add duplicates.
    (my $variant = $tail) =~ s/\n/\n[> ]*/g;
    push(@quoted, $variant);
  }
  push @remove_tail, @quoted;
}
69
70
# English month abbreviation => month number (1..12); used by date_parser().
my %months;
@months{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = (1 .. 12);

# Name of this script with any leading directory part removed.
($progname = $0) =~ s/.*[\/]//;

main();
78
79
####
80
#### main sub routine
81
####
82
83
#
# Program entry point: reads defaults from my.cnf (via my_print_defaults),
# parses the command line, connects to the database, optionally creates the
# table, feeds every inbox (or standard input) to process_mail_file() and
# finally prints a report of the counters. Never returns; exits with 0.
#
sub main
{
  my ($connect_arg, @args, $ignored, @defops, $total);

  if (defined(my_which("my_print_defaults")))
  {
    # Options from the [mail_to_db] group go in front of the real arguments
    # so that anything given on the command line overrides them.
    @defops = `my_print_defaults mail_to_db`;
    chomp @defops;
    unshift @ARGV, @defops;
  }
  else
  {
    print "WARNING: No command 'my_print_defaults' found; unable to read\n";
    print "the my.cnf file. This command is available from the latest MySQL\n";
    print "distribution.\n";
  }
  # Without explicit destinations Getopt::Long stores each option in the
  # package variable $opt_<name>.
  GetOptions("help","version","host=s","port=i","socket=s","db=s",
             "user=s","password=s","max_mail_size=i","create","test",
             "no_path","debug","stop_on_error","stdin")
  || die "Wrong option! See $progname --help\n";

  usage($VER) if ($opt_help || $opt_version ||
                  (!$ARGV[0] && !$opt_create && !$opt_stdin));

  # Check that the given inbox files exist and are regular files
  if (!$opt_stdin)
  {
    foreach my $inbox (@ARGV)
    {
      die "FATAL: Can't find inbox file: $inbox\n" if (! -f $inbox);
    }
  }

  # Checked before connecting so that no connection is opened in vain.
  die "You must specify the database; use --db=" if (!defined($opt_db));

  $connect_arg = "DBI:mysql:";
  push @args, "database=$opt_db" if defined($opt_db);
  push @args, "host=$opt_host" if defined($opt_host);
  push @args, "port=$opt_port" if defined($opt_port);
  push @args, "mysql_socket=$opt_socket" if defined($opt_socket);
  push @args, "mysql_read_default_group=mail_to_db";
  $connect_arg .= join ';', @args;
  $dbh = DBI->connect("$connect_arg", $opt_user, $opt_password,
                      { PrintError => 0})
  || die "Couldn't connect: $DBI::errstr\n";

  create_table($dbh) if ($opt_create);

  # process_mail_file() reads from the global handle FILE, so each input is
  # opened on that handle before the call.
  if ($opt_stdin)
  {
    open(FILE, "-") || die "FATAL: Can't read standard input: $!\n";
    process_mail_file($dbh, "READ-FROM-STDIN");
    close(FILE);
  }
  else
  {
    # A named lexical loop variable is used on purpose: process_mail_file()
    # reads lines into $_, which would otherwise overwrite the aliased
    # @ARGV element.
    foreach my $inbox (@ARGV)
    {
      # Check if the file is compressed
      if ($inbox =~ /^(.*)\.(gz|Z)$/)
      {
        my $plain_name = $1;
        # List form of the pipe open: the file name is handed to zcat
        # directly and never interpreted by a shell.
        if (!open(FILE, "-|", "zcat", $inbox))
        {
          warn "WARNING: Can't run zcat on $inbox: $!\n";
          next;
        }
        process_mail_file($dbh, $plain_name);
        close(FILE) || warn "WARNING: zcat reported an error for $inbox\n";
      }
      else
      {
        if (!open(FILE, "<", $inbox))
        {
          warn "WARNING: Can't open inbox file $inbox: $!\n";
          next;
        }
        process_mail_file($dbh, $inbox);
        close(FILE);
      }
    }
  }
  $dbh->disconnect if (!$opt_test);

  # NOTE(review): $mail_fixed is incremented by update_table() for mails
  # whose footer was stripped; those mails appear to continue to the insert
  # path as well, so they may be counted twice in the totals - confirm.
  $ignored = ($mail_no_from_f + $mail_no_subject_f + $mail_no_txt_f +
              $mail_too_big + $mail_duplicates + $mail_fixed);
  print "################################ Mail Report #################################\n\n";
  print "Mails inserted:\t\t\t\t\t$mail_inserted\n";
  print "---------------                                ";
  print "=" . "=" x length("$mail_inserted") . "=\n\n";
  if ($ignored)
  {
    print "Ignored mails\n";
    print "-------------\n";
    if ($mail_no_from_f)
    {
      print "Reason: mail without \"From:\" -field:\t\t$mail_no_from_f\n";
    }
    if ($mail_no_txt_f)
    {
      print "Reason: mail without message:\t\t\t$mail_no_txt_f\n";
    }
    if ($mail_no_subject_f)
    {
      print "Reason: mail without subject:\t\t\t$mail_no_subject_f\n";
    }
    if ($mail_too_big)
    {
      print "Reason: mail too big, over $opt_max_mail_size bytes:\t\t";
      print $mail_too_big;
      print " (see --max_mail_size=#)\n";
    }
    if ($mail_duplicates)
    {
      print "Reason: duplicate mail, or in db already:\t$mail_duplicates\n";
    }
    if ($mail_fixed)
    {
      print "Reason: mail was an unsubscribe - mail:\t\t$mail_fixed\n";
    }
    print "                                               ";
    print "=" . "=" x length("$ignored") . "=\n";
    print "Total number of ignored mails:\t\t\t$ignored\n\n";
  }
  $total = $mail_inserted + $ignored;
  print "Total number of mails:\t\t\t\t";
  print $total;
  print " (OK: ";
  # Guard against division by zero when no mail was seen at all.
  print sprintf("%.1f", $total ? (($mail_inserted / $total) * 100) : 0.0);
  print "% Ignored: ";
  print sprintf("%.1f", $total ? (($ignored / $total) * 100) : 0);
  print "%)\n";
  print "################################ End Report ##################################\n";
  exit(0);
}
226
227
####
228
#### table creation
229
####
230
231
#
# Create the my_mail table through the given database handle.
# Dies with the DBI error text if the statement cannot be prepared or
# executed.
#
sub create_table
{
  my ($dbh) = @_;

  # One statement line per element; the trailing empty element yields the
  # final newline.
  my $ddl = join("\n",
    "CREATE TABLE my_mail",
    "(",
    " mail_id MEDIUMINT UNSIGNED NOT NULL auto_increment,",
    " message_id VARCHAR(255),",
    " in_reply_to VARCHAR(255),",
    " date DATETIME NOT NULL,",
    " time_zone VARCHAR(20),",
    " mail_from VARCHAR(120) NOT NULL,",
    " reply VARCHAR(120),",
    " mail_to TEXT,",
    " cc TEXT,",
    " sbj VARCHAR(200),",
    " txt MEDIUMTEXT NOT NULL,",
    " file VARCHAR(64) NOT NULL,",
    " hash INTEGER NOT NULL,",
    " KEY (mail_id),",
    " KEY (message_id),",
    " KEY (in_reply_to),",
    " PRIMARY KEY (mail_from, date, hash))",
    " ENGINE=MyISAM COMMENT=''",
    "");

  my $statement = $dbh->prepare($ddl);
  die $DBI::errstr unless ($statement);
  $statement->execute() or die "Couldn't create table: $DBI::errstr\n";
}
261
262
####
263
#### inbox processing. Can be either a real file, or standard input.
264
####
265
266
#
# Read an mbox style inbox from the already opened global handle FILE and
# hand every mail found in it to update_table().
#
# Arguments: $dbh       - database handle, passed through to update_table()
#            $file_name - name stored with each mail; the directory part
#                         is removed when --no_path is given
#
# The parser is a small state machine driven by $type: while reading the
# headers $type names the header last seen (so that continuation lines can
# be appended to it); once the empty line ending the headers is read,
# $type becomes "message" and lines are collected as the body until the
# next "From " separator line.
#
sub process_mail_file
{
  my ($dbh, $file_name) = @_;
  my (%values, $type, $check, $line);

  $file_name =~ s/.*[\/]// if ($opt_no_path);

  %values = ();
  $type = "";
  $check = 0;
  # A lexical line buffer is used instead of $_ so that the caller's $_
  # (often an alias into @ARGV) is left untouched.
  while (defined($line = <FILE>))
  {
    # Remove the line terminator, LF or CRLF. chomp, unlike chop, leaves a
    # final line without a newline intact.
    chomp($line);
    $line =~ s/\r\z//;
    if ($type ne "message")
    {
      if ($line =~ /^Reply-To:\s*(.*)/i)
      {
        $type = "reply";
        $values{$type} = $1;
      }
      elsif ($line =~ /^From: (.*)/i)
      {
        $type = "from";
        $values{$type} = $1;
      }
      elsif ($line =~ /^To: (.*)/i)
      {
        $type = "to";
        $values{$type} = $1;
      }
      elsif ($line =~ /^Cc: (.*)/i)
      {
        $type = "cc";
        $values{$type} = $1;
      }
      elsif ($line =~ /^Subject: (.*)/i)
      {
        $type = "subject";
        $values{$type} = $1;
      }
      elsif ($line =~ /^Message-Id:\s*(.*)/i)
      {
        $type = "message_id";
        # Reduce the captured value to the <...> id. The substitution has
        # to run on the captured value; run on the whole header line it
        # can never match because the line starts with the header name.
        ($values{$type} = $1) =~ s/^\s*(<.*>)\s*/$1/;
      }
      elsif ($line =~ /^In-Reply-To:\s*(.*)/i)
      {
        $type = "in_reply_to";
        ($values{$type} = $1) =~ s/^\s*(<.*>)\s*/$1/;
      }
      elsif ($line =~ /^Date: (.*)/i)
      {
        date_parser($1, \%values, $file_name);
        $type = "rubbish";
      }
      # Catch those fields that we don't or can't handle (yet)
      # NOTE(review): [\w\W-] matches any character, so every header line
      # containing a ':' ends up here, including folded continuation lines
      # of the headers above. [\w-]+ was possibly intended - confirm before
      # changing, as it alters what is stored.
      elsif ($line =~ /^[\w\W-]+:/)
      {
        $type = "rubbish";
      }
      elsif ($line eq "")
      {
        # The empty line ends the headers; the body starts here.
        $type = "message";
        $values{$type} = "";
      }
      else
      {
        # Continuation of the previous header: join with a single space.
        $line =~ s/^\s*/ /;
        if ($type eq 'message_id' || $type eq 'in_reply_to')
        {
          $line =~ s/^\s*(<.*>)\s*/$1/;
        }
        $values{$type} .= $line;
      }
    }
    elsif ($check != 0 && $line ne "") # in case of forwarded messages
    {
      # The first non-empty line after a "forwarded message" marker is
      # taken as body text even if it looks like a "From " separator.
      $values{$type} .= "\n" . $line;
      $check--;
    }
    elsif ($line =~ /^From .* \d\d:\d\d:\d\d\s\d\d\d\d/ ||
           $line =~ /^From .* \d\d\d\d\s\d\d:\d\d:\d\d/)
    {
      # Separator line of the next mail: store the one collected so far.
      $values{'hash'} = checksum("$values{'message'}");
      update_table($dbh, $file_name, \%values);
      %values = ();
      $type = "";
      $check = 0;
    }
    elsif ($line =~ /-* forwarded message .*-*/i) # in case of forwarded messages
    {
      $values{$type} .= "\n" . $line;
      $check++;
      $mail_forwarded++;
    }
    else
    {
      $values{$type} .= "\n" . $line;
    }
  }
  # The last mail in the file has no separator line after it.
  if (defined($values{'message'}))
  {
    $values{'hash'} = checksum("$values{'message'}");
    update_table($dbh, $file_name, \%values);
  }
}
375
376
####
377
#### get date and timezone
378
####
379
380
#
# Parse the value of a Date: header and store the result in the hash
# referenced by $values:
#   'date'      - "YEAR-MONTH-DAY HH:MM:SS" (":00" is appended when the
#                 header carries no seconds)
#   'time_zone' - whatever follows the time, or undef if nothing does
# If the header cannot be parsed both are set to "" (and the program exits
# with status 1 when --stop_on_error is given).
#
# Arguments: $date_raw  - header value, e.g. "Fri, 15 Dec 2000 10:20:30 +0200"
#            $values    - hash reference to fill in
#            $file_name - inbox name, only used in the debug output
#
sub date_parser
{
  my ($date_raw, $values, $file_name) = @_;
  my ($day, $month, $year, $hhmm, $secs, $zone, $tmp);

  # If you ever need to change this test, be especially careful with
  # the timezone; it may be just a number (-0600), or just a name (EET), or
  # both (-0600 (EET), or -0600 (EET GMT)), or without parenthesis: GMT.
  # You probably should use a 'greedy' regexp in the end
  #
  # The captures are taken from the list returned by the match. On a failed
  # match that list is empty, whereas $1..$6 would still hold the values of
  # the caller's last successful match.
  ($day, $month, $year, $hhmm, $secs, $zone) =
    ($date_raw =~ /^\D*(\d{1,2})\s+(\w+)\s+(\d{2,4})\s+(\d+:\d+)(:\d+)?\s*(\S+.*)?/);

  if (!defined($day) || !defined($month) || !defined($months{$month}))
  {
    if ($opt_debug || $opt_stop_on_error)
    {
      # Print undefined pieces as empty strings instead of triggering
      # "uninitialized value" warnings.
      my @shown = map { defined($_) ? $_ : "" }
                  ($day, $month, $year, $hhmm, $secs,
                   defined($month) ? $months{$month} : undef);
      print "FAILED: date_parser: 1: $shown[0] 2: $shown[1] 3: $shown[2] 4: $shown[3] 5: $shown[4]\n";
      print "months{2}: $shown[5]\n";
      print "date_raw: $date_raw\n";
      print "Inbox filename: $file_name\n";
    }
    exit(1) if ($opt_stop_on_error);
    $values->{'date'} = "";
    $values->{'time_zone'} = "";
    return;
  }
  $tmp = $year . "-" . $months{$month} . "-" . "$day $hhmm";
  $tmp.= defined($secs) ? $secs : ":00";
  $values->{'date'} = $tmp;
  print "INSERTING DATE: $tmp\n" if ($opt_debug);
  $values->{'time_zone'} = $zone;
}
411
412
####
413
#### Insert to table
414
#### 
415
416
#
# Clean up one parsed mail and insert it into the my_mail table.
#
# Arguments: $dbh       - database handle (quote() and do() are used)
#            $file_name - inbox name stored in the 'file' column
#            $values    - hash reference with the parsed fields: subject, to,
#                         from, reply, cc, message_id, in_reply_to, date,
#                         time_zone, message and hash
#
# Exactly one of the global counters is incremented to record what happened
# to the mail ($mail_fixed is incremented in addition, once per pass that
# stripped a list footer). $values->{'message'} is overwritten with the
# cleaned-up text. Dies if the insert fails for any reason other than a
# duplicate key.
#
sub update_table
{
  my($dbh, $file_name, $values) = @_;
  my($q, $message, @columns);

  if (!defined($values->{'subject'}) || !defined($values->{'to'}))
  {
    $mail_no_subject_f++;
    return;			# Ignore these
  }
  $message = $values->{'message'};
  $message =~ s/^\s*//; # removes whitespaces from the beginning

  # Strip known list footers from the end of the message. This is repeated
  # until a pass changes nothing, as some mails carry the footer more than
  # once.
  while (1)
  {
    $message =~ s/[\s\n>]*$//; # removes whitespaces and '>' from the end
    $values->{'message'} = $message;
    foreach my $tail (@remove_tail)
    {
      $message =~ s/$tail//;
    }
    last if ($message eq $values->{'message'});
    $message =~ s/\s*$//; # removes whitespaces from the end
    $mail_fixed++;
  }

  # The rejection tests come before the query is built, so that a rejected
  # (possibly huge) message is never quoted in vain.

  # Don't insert mails bigger than $opt_max_mail_size
  if (length($message) > $opt_max_mail_size)
  {
    $mail_too_big++;
    return;
  }
  # Don't insert mails without 'From' field
  if (!defined($values->{'from'}) || $values->{'from'} eq "")
  {
    $mail_no_from_f++;
    return;
  }
  # Don't insert mails without the 'message'. This is tested before the
  # dry-run branch below so that --test reports the same counts as a real
  # run would.
  if ($message eq "")
  {
    $mail_no_txt_f++;
    return;
  }

  # Column values in table order; optional fields become NULL when missing.
  @columns = ("NULL");		# mail_id, assigned by auto_increment
  foreach my $field ('message_id', 'in_reply_to')
  {
    push(@columns, defined($values->{$field}) ?
	 $dbh->quote($values->{$field}) : "NULL");
  }
  # The column is NOT NULL: a mail without a parsed date gets ''.
  push(@columns, $dbh->quote(defined($values->{'date'}) ?
			     $values->{'date'} : ""));
  foreach my $field ('time_zone', 'from', 'reply', 'to', 'cc')
  {
    push(@columns, defined($values->{$field}) ?
	 $dbh->quote($values->{$field}) : "NULL");
  }
  push(@columns, $dbh->quote($values->{'subject'}));
  push(@columns, $dbh->quote($message));
  push(@columns, $dbh->quote($file_name));
  push(@columns, "'" . $values->{'hash'} . "'");

  $q = "INSERT INTO my_mail (" .
       "mail_id,message_id,in_reply_to,date,time_zone,mail_from,reply," .
       "mail_to,cc,sbj,txt,file,hash" .
       ") VALUES (" . join(",", @columns) . ")";

  if ($opt_test)
  {
    print "$q\n";
    $mail_inserted++;
  }
  elsif ($dbh->do($q))
  {
    $mail_inserted++;
  }
  # This should never happen. This means that the above q failed,
  # but it wasn't because of a duplicate mail entry
  elsif (!($DBI::errstr =~ /Duplicate entry /))
  {
    die "FATAL: Got error :$DBI::errstr\nAttempted query was: $q\n";
  }
  else
  {
    $mail_duplicates++;
    print "Duplicate mail: query: $q\n" if ($opt_debug);
  }
  return;
}
523
524
####
525
#### In case you have two identical messages we wanted to identify them
526
#### and remove additionals;  We do this by calculating a hash number of the
527
#### message and ignoring messages with the same from, date and hash.
528
#### This function calculates a simple 32 bit hash value for the message.
529
####
530
531
#
# Return a simple hash of the given text, kept within 31 bits.
# For every character the running value is doubled, the character code is
# added, plus 1 if bit 30 of the previous value was set; the result is then
# masked to the low 31 bits.
#
sub checksum
{
  my ($txt)= @_;
  my $sum = 0;

  foreach my $char (split(//, $txt))
  {
    my $carry = ($sum & 0x40000000) ? 1 : 0;	# bit 30 of the old value
    $sum = (($sum << 1) + ord($char) + $carry) & 0x7FFFFFFF;
  }
  return $sum;
}
544
545
####
546
#### my_which is used, because we can't assume that every system has the
547
#### which -command. my_which can take only one argument at a time.
548
#### Return values: requested system command with the first found path,
549
#### or undefined, if not found.
550
####
551
552
#
# Locate an executable. $command is returned unchanged if it already names
# an executable regular file; otherwise the directories in $ENV{'PATH'} are
# tried in order (an empty entry means the current directory) and the first
# "directory/command" that is an executable regular file is returned.
# Returns undef if nothing is found.
#
sub my_which
{
  my ($command) = @_;

  return $command if (-f $command && -x _);

  foreach my $dir (split(':', $ENV{'PATH'}))
  {
    my $candidate = ($dir eq "" ? "." : $dir) . "/$command";
    return $candidate if (-f $candidate && -x _);
  }
  return undef();
}
567
568
####
569
#### usage and version
570
####
571
572
#
# Print the version line (with --version) or the complete help text, then
# exit with status 0.
#
# Arguments: $version - version string to show
#
sub usage
{
  # Named $version so that it does not shadow the global $VER.
  my ($version)= @_;

  if ($opt_version)
  {
    print "$progname version $version\n";
  }
  else
  {
    print <<EOF;
$progname version $version

Description: Insert mails from inbox file(s) into a table. This program
can read group [mail_to_db] from the my.cnf file. You may want to have db
and table set there at least.

Usage: $progname [options] file1 [file2 file3 ...]
or:    $progname [options] --create [file1 file2...]
or:    cat inbox | $progname [options] --stdin

The last example can be used to read mails from standard input and can be
useful when inserting mails to database via a program 'on-the-fly'.
The filename will be 'READ-FROM-STDIN' in this case.

Options:
--help             Show this help and exit.
--version          Show the version number and exit.
--debug            Print some extra information during the run.
--host=...         Hostname to be used.
--port=#           TCP/IP port to be used with connection.
--socket=...       MySQL UNIX socket to be used with connection.
--db=...           Database to be used.
--user=...         Username for connecting.
--password=...     Password for the user.
--stdin            Read mails from stdin.
--max_mail_size=#  Maximum size of a mail in bytes.
                   Beware of the downside letting this variable be too big;
                   you may easily end up inserting a lot of attached
                   binary files (like MS Word documents etc), which take
                   space, make the database slower and are not really
                   searchable anyway. (Default $opt_max_mail_size)
--create           Create the mails table. This can be done with the first run.
--test             Dry run. Print the queries and the result as it would be.
--no_path          When inserting the file name, leave out any paths of
                   the name.
--stop_on_error    Stop the run, if an unexpected, but not fatal error occurs
                   during the run. Without this option some fields may get
                   unwanted values. --debug will also report about these.
EOF
  }
  exit(0);
}