4 a) our mailfort email database
5 point it at the top directory, containing YEAR/MONTH/DAY.... directories.
6 scan each file (over a year old...)
7 extract out the attachment, and replace with HTML
8 DATABASE? - mysql or sqlite? -
9 filesize / name / date / checksum / mimetype -- into mailfort should be OK.
10 b) the imap user emails
11 loop through user's directories
12 check age of email .. over 1 years..
13 ?? how to prevent 'repeat' scanning of emails?
14 ??? hidden '.' files containing last scan date?
16 check if file exists in our DB.. - replace the link...
17 otherwise generate a file. + add to DB...
22 -> URL -> redirect to correct server
25 More notes on our Mailfort DB sync:
26 * some of these attachments are already in the database...
27 - so we need to update the DB..
28 - probably worth putting the code in a stored procedure..
31 * first scan (and extract)
32 * rescan (as I messed up the first time - fix the DB...)
33 * email scan - attachments might not have related messages.
36 - {id} attachment_init(
41 // creates or returns id (can look for existing messages?
42 // can do a merge?? - copy 'old' record data into 'new'.... "prefer checksummed"
49 {created} // message date..
54 attachment_update_store(
62 // valac --pkg gmime --vapi
65 // http://www.fromdual.com/mysql-vala-program-example << check mysql if this does not work.
67 valac -g --vapidir=. --thread strip.vala --vapidir=../vapi \
68 --pkg glib-2.0 --pkg mysql --pkg gio-2.0 --pkg posix --pkg gmime-2.6 \
69 --Xcc=-lmysqlclient -v \
73 public class StripApplication : GLib.Application {
75 public static string? opt_path = null;
76 public static string? opt_file = null;
77 public static string? opt_target_path = null;
78 public static string? opt_db_host = "127.0.0.1";
79 public static string? opt_db_name = null;
80 public static string? opt_db_user = null;
81 public static string? opt_db_pass = null;
83 public static int opt_limit = -1;
85 public static bool opt_is_extracting = false;
86 public static bool opt_is_replacing = false;
87 public static bool opt_scan_imap = false;
88 public static bool opt_scan_mailfort = false;
89 public static bool opt_dump = false;
90 public static bool opt_debug = false;
92 public static bool opt_debug_sql = false;
93 public static string? opt_replace_link = null;
96 public const GLib.OptionEntry[] options = {
98 { "debug", 0, 0, OptionArg.NONE, ref opt_debug, "show debug messages for components", null },
99 { "debug-sql", 0, 0, OptionArg.NONE, ref opt_debug_sql, "debug the SQL statements", null },
101 { "path", 0, 0, OptionArg.STRING, ref opt_path, "Directory where email to be parsed is", null },
102 { "file", 0, 0, OptionArg.STRING, ref opt_file, "A specific file to be parsed", null },
104 { "target-path", 0, 0, OptionArg.STRING, ref opt_target_path, "Directory where attachments are to be put", null },
106 { "link", 0, 0, OptionArg.STRING, ref opt_replace_link, "url for the replement link: eg. http://www.mysite.com/xxxx/%s", null },
108 { "host", 0, 0, OptionArg.STRING, ref opt_db_host, "Mysql host (default localhost)", null },
109 { "name", 0, 0, OptionArg.STRING, ref opt_db_name, "Mysql database name REQUIRED", null },
110 { "user", 0, 0, OptionArg.STRING, ref opt_db_user, "Mysql database user REQUIRED", null },
111 { "pass", 0, 0, OptionArg.STRING, ref opt_db_pass, "Mysql database password (default empty)", null },
113 { "extract", 0, 0, OptionArg.NONE, ref opt_is_extracting, "Should attachments be extracted (default NO)", null },
114 { "replace", 0, 0, OptionArg.NONE, ref opt_is_replacing, "Should attachments be replaced (default NO)", null },
115 { "dump", 0, 0, OptionArg.NONE, ref opt_dump, "Print the replaced mail contents to stdout", null },
117 { "limit", 0, 0, OptionArg.INT, ref opt_limit, "stop after X number of messages with attachments have been processed", null },
119 { "scan-imap", 0, 0, OptionArg.NONE, ref opt_scan_imap, "scan an imap tree", null },
120 { "scan-mailfort", 0, 0, OptionArg.NONE, ref opt_scan_mailfort, "scan a mailfort tree", null },
123 public StripApplication( string[] args )
126 application_id: "org.roojs.mailstripper",
127 flags: ApplicationFlags.FLAGS_NONE
131 var opt_context = new GLib.OptionContext ("Mail Stripper");
135 opt_context.set_help_enabled (true);
136 opt_context.add_main_entries (options, null);
137 opt_context.parse ( ref args);
138 //opt_detach = !optx_no_detach;
142 // options that have to be set.. bee or hive... (or stop all)
143 if ((!opt_scan_mailfort && !opt_scan_imap) || (opt_scan_mailfort && opt_scan_imap)) {
144 stdout.printf ("You must specify the type of directory tree to scan - either imap or mailfort\n%s",
145 opt_context.get_help(true, null));
146 GLib.Process.exit(Posix.EXIT_FAILURE);
149 if ((opt_db_name == null || opt_db_name.length < 1 || opt_db_user == null || opt_db_user.length < 1)) {
150 stdout.printf ("You must specify the database name / user \n%s",
151 opt_context.get_help(true, null));
152 GLib.Process.exit(Posix.EXIT_FAILURE);
154 if ((opt_path == null || opt_path.length < 1) ) {
155 stdout.printf ("You must specify the scan start path\n%s",
156 opt_context.get_help(true, null));
157 GLib.Process.exit(Posix.EXIT_FAILURE);
159 if (opt_replace_link == null || (opt_replace_link.length < 1)) {
160 stdout.printf ("You must specify the link to use in the replacement \n%s",
161 opt_context.get_help(true, null));
162 GLib.Process.exit(Posix.EXIT_FAILURE);
164 if ((opt_is_replacing || opt_is_extracting ) && (opt_target_path == null || opt_target_path.length < 1)) {
165 stdout.printf ("You must specify a target path to put attachments\n%s",
166 opt_context.get_help(true, null));
167 GLib.Process.exit(Posix.EXIT_FAILURE);
171 } catch (GLib.OptionError e) {
172 stdout.printf ("error: %s\n", e.message);
173 stdout.printf ("Run '%s --help' to see a full list of available command line options.\n%s",
174 args[0], opt_context.get_help(true, null));
175 GLib.Process.exit(Posix.EXIT_FAILURE);
179 public static int main(string[] args)
182 var application = new StripApplication( args);
184 GLib.Log.set_always_fatal(LogLevelFlags.LEVEL_ERROR | LogLevelFlags.LEVEL_CRITICAL);
186 if (opt_debug || opt_debug_sql) {
187 GLib.Log.set_handler(null,
188 GLib.LogLevelFlags.LEVEL_DEBUG | GLib.LogLevelFlags.LEVEL_WARNING | GLib.LogLevelFlags.LEVEL_INFO,
196 if (StripApplication.opt_is_replacing) {
197 StripApplication.opt_is_extracting = true;
200 GLib.debug("scanning folder: %s", opt_path );
202 var strip = new Strip( opt_path );
205 strip.mysql = new Mysql.Database();
206 if (!strip.mysql.real_connect(
209 opt_db_pass == null ? "" : opt_db_pass, //passwd
211 3306, // not changable...?
215 stdout.printf("ERROR %u: Connection failed: %s\n",
216 strip.mysql.errno(), strip.mysql.error()
220 if (opt_file != null) {
221 strip.base_dir = opt_path;
222 strip.scan_file( GLib.Path.get_dirname(opt_file), GLib.Path.get_basename(opt_file));
226 strip.scan_dir(opt_path);
234 public class Strip : GLib.Object {
238 public string base_dir = "";
240 public Mysql.Database mysql;
244 uint64 used_space_before = 0;
245 uint64 used_space_after = 0;
248 public Strip(string base_dir)
250 this.base_dir = base_dir;
253 public void handle_part(GMime.Object parent, GMime.Object mime_obj)
255 if (mime_obj is GMime.Part) {
256 var p = (GMime.Part)mime_obj;
257 var ct = p.get_content_type();
258 var cd = p.get_content_disposition();
260 var sid = p.get_header("X-strip-id");
261 if (sid != null && sid.length > 0) {
262 this.update_attachment_db(p);
263 GLib.debug("Skip attachment replace - it's already been done");
267 if (cd == null || cd.get_disposition().down() != "attachment") {
270 if (ct.get_media_type() == "text") {
273 if (ct.to_string() == "application/pgp-encrypted") {
276 if (ct.to_string() == "application/pgp-keys") {
279 if (p.get_filename() == null) {
282 // print("got part %s\n", ct.to_string());
283 if (parent is GMime.Multipart) {
285 this.replace_attachment(((GMime.Multipart)parent), p);
293 if (mime_obj is GMime.Multipart) {
296 var mp = (GMime.Multipart)mime_obj;
297 //var ct = mp.get_content_type();
299 //print("got multi-part %s\n", ct.to_string());
300 for (var i = 0; i< mp.get_count(); i++) {
301 var mo = mp.get_part(i);
302 this.handle_part(mime_obj,mo);
304 // ((GMime.Multipart)mime_obj).foreach((sub_obj) => {
305 // Strip.handle_part(sub_obj);
313 if (mime_obj is GMime.MessagePart) {
314 var msg = ((GMime.MessagePart)mime_obj).get_message();
315 msg.foreach((subobj) => {
316 this.handle_part(msg,subobj);
319 //print("got message-part\n");
323 if (mime_obj is GMime.Message) {
324 var mp = ((GMime.Message) mime_obj).get_mime_part();
326 if (!(mp is GMime.Multipart)) {
327 GLib.debug("get mimepart does not return a Multipart?");
331 var mpc = ((GMime.Multipart)mp).get_count();
333 //GLib.debug("Message has %d parts", mpc);
334 for (var i =0 ; i < mpc; i++) {
335 //GLib.debug("Getting part %d", i);
336 var submime_obj = ((GMime.Multipart)mp).get_part(i);
337 this.handle_part(mp,submime_obj);
339 print("got message??\n");
343 print("got something else\n");
347 public void update_attachment_db(GMime.Part attachment)
349 // only called when we have an sid...
350 var sid = attachment.get_header("X-strip-id");
351 if (sid == null || sid.length < 1) {
352 GLib.debug("Strange - update attachment db called ?");
356 SELECT attachment_update(
357 %d, -- in_id INT(11),
358 '%s', -- in_mime_type varchar(255),
359 %d, -- in_mime_size int(11),
360 '%s', -- in_created DATETIME,
361 '%s' -- in_mailfort_sig varchar(64)
368 "", // this will be ignored..
369 0, // this will be ingored..
371 this.mysql_escape(this.active_message_x_mailfort_sig)
381 public void replace_attachment(GMime.Multipart parent, GMime.Part attachment)
383 var sid = attachment.get_header("X-strip-id");
384 if (sid != null && sid.length > 0) {
385 GLib.debug("Skip attachment replace - it's already been done");
389 var c = attachment.get_content_object();
391 var filename = attachment.get_filename().replace("/", "-");
392 var fn = GLib.Environment.get_tmp_dir() +
393 "/"+ this.active_name + "."+ filename;
395 var outfile = new GMime.StreamFile.for_path(fn, "w");
396 outfile.set_owner(true);
397 var file_size = (int) c.write_to_stream(outfile);
398 var chksum = this.md5_file(fn);
402 if (file_size == 0) {
404 GLib.debug("ERROR - file size of write to stream returned 0?");
412 var mime_type= attachment.get_content_type().to_string();
413 // at this point we have to do our database magic...
414 //filesize / name / date / checksum / mimetype -- into mailfort should be OK.
420 '%s', -- in_msgid VARCHAR(32),
421 '%s', -- in_checksum VARCHAR(64),
422 '%s', -- in_mime_filename varchar(255)
427 this.mysql_escape(this.active_message_exim_id),
429 this.mysql_escape( attachment.get_filename() ), // what is thsi is invalid?
433 var rs = mysql.use_result();
435 var row = rs.fetch_row();
439 //var create_dir = this.created_dir;
442 if (rs == null || rs.num_rows() < 1) {
443 GLib.debug("ERROR - CALL to attachment_init failed");
449 if (int.parse(file_id) < 1) {
450 GLib.debug("ERROR - CALL to attachment_init failed - returned 0?");
457 GLib.debug("fn = %s, m5=%s, id= %s", filename, mime_type, this.active_message_id);
460 SELECT attachment_update(
461 %d, -- in_id INT(11),
462 '%s', -- in_mime_type varchar(255),
463 '%s', -- in_created DATETIME,
464 '%s', -- in_mailfort_sig varchar(64)
469 this.mysql_escape(mime_type),
471 this.mysql_escape(this.active_message_x_mailfort_sig)
476 this.used_space_after += file_size;
479 file_id = "%d".printf((int)file_id_real);
483 if (StripApplication.opt_is_extracting) {
484 target_fn = StripApplication.opt_target_path + "/" + this.created_dir +"/"+ file_id + "-" + filename;
487 var stored = "/" + this.created_dir +"/"+ file_id + "-" + filename;
493 stored_filename = '%s'
497 this.mysql_escape( stored),
503 var rep = new GMime.Part.with_type("text","html");
504 // we have to set up a redirect server - to redirect hpasite... to their internal service..
505 rep.set_filename(filename);
506 var txt = "<html><body>"+
507 "<a href=\"" + StripApplication.opt_replace_link + "/" +
508 file_id + "/" + create_dir + "/"+chksum+"/"+ GLib.Uri.escape_string( filename) +"\">" +
509 GLib.Uri.escape_string( filename) + // fixme needs html escaping...
513 rep.get_content_type().set_parameter("charset", "utf-8");
514 rep.set_header("X-strip-id", file_id);
515 rep.set_header("X-strip-content-name", filename);
516 rep.set_header("X-strip-path", create_dir + "/" + file_id + "-" + filename);
517 rep.set_header("X-strip-content-type", mime_type);
518 var stream = new GMime.StreamMem.with_buffer(txt.data);
519 var con = new GMime.DataWrapper.with_stream(stream,GMime.ContentEncoding.DEFAULT);
521 rep.set_content_object(con);
522 GLib.debug("Replacing Attachment with HTML");
523 parent.replace(parent.index_of(attachment), rep);
524 this.has_replaced = true;
526 if (StripApplication.opt_is_extracting && target_fn.length > 0) {
527 var dir = GLib.Path.get_dirname(target_fn);
528 if (!FileUtils.test (dir, FileTest.IS_DIR)) {
529 GLib.DirUtils.create_with_parents(dir, 0755);
531 GLib.debug("Creating file %s", target_fn);
532 if (!FileUtils.test (target_fn, FileTest.EXISTS)) {
533 Posix.link(fn, target_fn);
536 GLib.debug("Skipping extraction %s", target_fn);
544 public int query(string str)
546 if (StripApplication.opt_debug_sql) {
547 GLib.debug("SQL: %s\n", str);
550 var rc= this.mysql.query(str);
553 GLib.debug("ERROR %u: Query failed: %s\n", this.mysql.errno(), this.mysql.error());
558 public string mysql_escape(string str)
560 unichar[] value_escaped = new unichar[str.length * 2 + 1];
561 this.mysql.real_escape_string ((string) value_escaped, str, str.length);
562 return (string) value_escaped;
565 public string md5_file(string fn) {
566 Checksum checksum = new Checksum (ChecksumType.MD5);
568 FileStream stream = FileStream.open (fn, "rb");
572 while ((size = stream.read (fbuf)) > 0) {
573 checksum.update (fbuf, size);
576 unowned string digest = checksum.get_string ();
580 string active_path = "";
581 string active_name = "";
582 string active_message_id = "";
583 string active_message_x_mailfort_sig = "";
584 string active_message_exim_id = "";
585 bool has_replaced = false;
586 string created_date = ""; // should be YYYY-mm-dd
587 string created_dir = ""; // should be YYY/mm/dd
589 public void scan_file(string path, string name)
591 GLib.debug("Scan: %s/%s", path,name);
593 this.has_replaced = false;
594 this.active_path = path;
595 this.active_name = name;
596 this.active_message_id = "";
598 this.created_dir = this.active_path.substring(this.base_dir.length + 1 );
599 this.created_date = this.created_dir.replace("/", "-");
601 var file_size = (int) File.new_for_path(path +"/" + name)
602 .query_info(GLib.FileAttribute.STANDARD_SIZE,GLib.FileQueryInfoFlags.NONE,null)
605 this.used_space_before += file_size;
607 var stream = new GMime.StreamFs.for_path (path +"/" + name,Posix.O_RDONLY, 0);
608 //stream.set_owner(true);
609 var parser = new GMime.Parser.with_stream(stream);
610 var message = parser.construct_message();
612 if (message == null) {
613 GLib.debug("Could not parse file? %s/%s", path,name);
614 this.used_space_after += file_size;
619 // check : - is message over a year old?
620 // get various msg info..
621 this.active_message_id = message.get_message_id();
622 this.active_message_x_mailfort_sig = message.get_header("x-mailfort-sig");
623 var recvd = message.get_header("received");
624 this.active_message_exim_id = "";
625 if (recvd != null && recvd.length > 1) {
626 GLib.debug("RECV: %s", recvd);
627 var lines = recvd.split("\t");
628 for (var i = 0; i < lines.length;i++) {
629 var bits = lines[i].strip().split(" ");
630 if (bits[0] == "id") {
631 this.active_message_exim_id = bits[1];
635 GLib.debug("Message DATA:\n mid: %s\nmailfort: %s \nexim_id: %s",
636 this.active_message_id,
637 this.active_message_x_mailfort_sig,
638 this.active_message_exim_id
644 var mp = message.get_mime_part();
646 if (!(mp is GMime.Multipart)) {
647 GLib.debug("get mimepart does not return a Multipart?");
648 this.used_space_after += file_size;
652 var mpc = ((GMime.Multipart)mp).get_count();
654 //GLib.debug("Message has %d parts", mpc);
655 for (var i =0 ; i < mpc; i++) {
656 //GLib.debug("Getting part %d", i);
657 var mime_obj = ((GMime.Multipart)mp).get_part(i);
658 this.handle_part(mp,mime_obj);
663 // stream.set_owner(false);
665 stream = null;//.close();
668 if (!this.has_replaced) {
669 this.used_space_after += file_size;
670 GLib.debug("skpping write file - no replacement occured");
674 GMime.Stream outstream = new GMime.StreamNull();
675 if (StripApplication.opt_is_replacing) {
677 tmpfile = GLib.Environment.get_tmp_dir() +"/" + name;
678 outstream = new GMime.StreamFile.for_path (tmpfile,"w");
679 ((GMime.StreamFile)outstream).set_owner(true);
681 if (StripApplication.opt_dump) {
682 outstream = new GMime.StreamMem();
685 file_size = (int) message.write_to_stream(outstream);
686 if (StripApplication.opt_is_replacing) {
687 ((GMime.StreamFile)outstream).set_owner(false);
689 if (StripApplication.opt_dump) {
690 var ua = ((GMime.StreamMem)outstream).get_byte_array().data;
691 print("%s\n", (string) ua);
696 GLib.debug("finished writing output %d", file_size);
701 this.used_space_after += file_size;
704 if (StripApplication.opt_is_replacing) {
705 Posix.unlink(path +"/" + name);
706 GLib.debug("copy tmp file %s to %s" , tmpfile, path +"/" + name);
707 Posix.link(tmpfile, path +"/" + name);
708 Posix.unlink(tmpfile);
712 if (StripApplication.opt_limit > -1 && this.processed >= StripApplication.opt_limit) {
713 GLib.debug("Reached replacement limit");
723 public void scan_dir(string path)
725 var f = File.new_for_path(path);
726 FileEnumerator file_enum;
727 var cancellable = new Cancellable ();
729 file_enum = f.enumerate_children(
730 FileAttribute.STANDARD_DISPLAY_NAME + "," + FileAttribute.STANDARD_TYPE,
731 FileQueryInfoFlags.NOFOLLOW_SYMLINKS, // FileQueryInfoFlags.NONE,
735 GLib.debug("Got error scanning dir? %s", e.message);
736 // FIXME - show error..
741 while (cancellable.is_cancelled () == false ) {
743 next_file = file_enum.next_file (cancellable);
745 GLib.debug("error getting next file? %s", e.message);
749 if (next_file == null) {
754 if (next_file.get_file_type() != FileType.DIRECTORY) {
756 if (next_file.get_display_name()[0] == ',') {
760 this.scan_file(path , next_file.get_display_name());
761 this.report_state("After scanning %s/%s".printf(path , next_file.get_display_name()));
766 //stdout.printf("Monitor.monitor: got file %s : type :%u\n",
767 // next_file.get_display_name(), next_file.get_file_type());
770 if (next_file.get_is_symlink()) {
775 var ds = next_file.get_display_name();
780 if (ds == "attachments") {
785 var sp = path+"/"+next_file.get_display_name();
787 //print("got a file : " + sp);
798 void report_state(string msg)
800 // Saved: 2G Original 10G : 20%
801 GLib.debug("Saved : %s (%.1f%%) | Original %s | %s",
802 GLib.format_size(this.used_space_before - this.used_space_after),
803 100f * ((1f * (this.used_space_before - this.used_space_after)) / (this.used_space_before * 1f)),
804 GLib.format_size(this.used_space_before),