4 a) our mailfort email database
5 point it at the top directory, containing YEAR/MONTH/DAY.... directories.
6 scan each file (over a year old...)
7 extract out the attachment, and replace with HTML
8 DATABASE? - mysql or sqlite? -
9 filesize / name / date / checksum / mimetype -- into mailfort should be OK.
10 b) the imap user emails
11 loop through user's directories
12 check age of email .. over 1 years..
13 ?? how to prevent 'repeat' scanning of emails?
14 ??? hidden '.' files containing last scan date?
16 check if file exists in our DB.. - replace the link...
17 otherwise generate a file. + add to DB...
22 -> URL -> redirect to correct server
25 More notes on our Mailfort DB sync:
26 * some of these attachments are already in the database...
27 - so we need to update the DB..
28 - probably worth putting the code in a stored procedure..
31 * first scan (and extract)
32 * rescan (as I messed up the first time - fix the DB...)
33 * email scan - attachments might not have related messages.
36 - {id} attachment_init(
41 // creates or returns id (can look for existing messages?
42 // can do a merge?? - copy 'old' record data into 'new'.... "prefer checksummed"
49 {created} // message date..
54 attachment_update_store(
62 // valac --pkg gmime --vapi
65 // http://www.fromdual.com/mysql-vala-program-example << check mysql if this does not work.
67 valac -g --vapidir=. --thread strip.vala --vapidir=../vapi \
68 --pkg glib-2.0 --pkg mysql --pkg gio-2.0 --pkg posix --pkg gmime-2.6 \
69 --Xcc=-lmysqlclient -v \
73 public class StripApplication : GLib.Application {
75 public static string? opt_path = null;
76 public static string? opt_file = null;
77 public static string? opt_target_path = null;
78 public static string? opt_db_host = "127.0.0.1";
79 public static string? opt_db_name = null;
80 public static string? opt_db_user = null;
81 public static string? opt_db_pass = null;
83 public static int opt_limit = -1;
85 public static bool opt_is_extracting = false;
86 public static bool opt_is_replacing = false;
87 public static bool opt_scan_imap = false;
88 public static bool opt_scan_mailfort = false;
89 public static bool opt_dump = false;
90 public static bool opt_debug = false;
92 public static bool opt_debug_sql = false;
93 public static string? opt_replace_link = null;
96 public const GLib.OptionEntry[] options = {
98 { "debug", 0, 0, OptionArg.NONE, ref opt_debug, "show debug messages for components", null },
99 { "debug-sql", 0, 0, OptionArg.NONE, ref opt_debug_sql, "debug the SQL statements", null },
101 { "path", 0, 0, OptionArg.STRING, ref opt_path, "Directory where email to be parsed is", null },
102 { "file", 0, 0, OptionArg.STRING, ref opt_file, "A specific file to be parsed", null },
104 { "target-path", 0, 0, OptionArg.STRING, ref opt_target_path, "Directory where attachments are to be put", null },
106 { "link", 0, 0, OptionArg.STRING, ref opt_replace_link, "url for the replement link: eg. http://www.mysite.com/xxxx/%s", null },
108 { "host", 0, 0, OptionArg.STRING, ref opt_db_host, "Mysql host (default localhost)", null },
109 { "name", 0, 0, OptionArg.STRING, ref opt_db_name, "Mysql database name REQUIRED", null },
110 { "user", 0, 0, OptionArg.STRING, ref opt_db_user, "Mysql database user REQUIRED", null },
111 { "pass", 0, 0, OptionArg.STRING, ref opt_db_pass, "Mysql database password (default empty)", null },
113 { "extract", 0, 0, OptionArg.NONE, ref opt_is_extracting, "Should attachments be extracted (default NO)", null },
114 { "replace", 0, 0, OptionArg.NONE, ref opt_is_replacing, "Should attachments be replaced (default NO)", null },
115 { "dump", 0, 0, OptionArg.NONE, ref opt_dump, "Print the replaced mail contents to stdout", null },
117 { "limit", 0, 0, OptionArg.INT, ref opt_limit, "stop after X number of messages with attachments have been processed", null },
119 { "scan-imap", 0, 0, OptionArg.NONE, ref opt_scan_imap, "scan an imap tree", null },
120 { "scan-mailfort", 0, 0, OptionArg.NONE, ref opt_scan_mailfort, "scan a mailfort tree", null },
123 public StripApplication( string[] args )
126 application_id: "org.roojs.mailstripper",
127 flags: ApplicationFlags.FLAGS_NONE
131 var opt_context = new GLib.OptionContext ("Mail Stripper");
135 opt_context.set_help_enabled (true);
136 opt_context.add_main_entries (options, null);
137 opt_context.parse ( ref args);
138 //opt_detach = !optx_no_detach;
142 // options that have to be set.. bee or hive... (or stop all)
143 if ((!opt_scan_mailfort && !opt_scan_imap) || (opt_scan_mailfort && opt_scan_imap)) {
144 stdout.printf ("You must specify the type of directory tree to scan - either imap or mailfort\n%s",
145 opt_context.get_help(true, null));
146 GLib.Process.exit(Posix.EXIT_FAILURE);
149 if ((opt_db_name == null || opt_db_name.length < 1 || opt_db_user == null || opt_db_user.length < 1)) {
150 stdout.printf ("You must specify the database name / user \n%s",
151 opt_context.get_help(true, null));
152 GLib.Process.exit(Posix.EXIT_FAILURE);
154 if ((opt_path == null || opt_path.length < 1) ) {
155 stdout.printf ("You must specify the scan start path\n%s",
156 opt_context.get_help(true, null));
157 GLib.Process.exit(Posix.EXIT_FAILURE);
159 if (opt_replace_link == null || (opt_replace_link.length < 1)) {
160 stdout.printf ("You must specify the link to use in the replacement \n%s",
161 opt_context.get_help(true, null));
162 GLib.Process.exit(Posix.EXIT_FAILURE);
164 if ((opt_is_replacing || opt_is_extracting ) && (opt_target_path == null || opt_target_path.length < 1)) {
165 stdout.printf ("You must specify a target path to put attachments\n%s",
166 opt_context.get_help(true, null));
167 GLib.Process.exit(Posix.EXIT_FAILURE);
171 } catch (GLib.OptionError e) {
172 stdout.printf ("error: %s\n", e.message);
173 stdout.printf ("Run '%s --help' to see a full list of available command line options.\n%s",
174 args[0], opt_context.get_help(true, null));
175 GLib.Process.exit(Posix.EXIT_FAILURE);
179 public static int main(string[] args)
182 var application = new StripApplication( args);
184 GLib.Log.set_always_fatal(LogLevelFlags.LEVEL_ERROR | LogLevelFlags.LEVEL_CRITICAL);
186 if (opt_debug || opt_debug_sql) {
187 GLib.Log.set_handler(null,
188 GLib.LogLevelFlags.LEVEL_DEBUG | GLib.LogLevelFlags.LEVEL_WARNING | GLib.LogLevelFlags.LEVEL_INFO,
196 if (StripApplication.opt_is_replacing) {
197 StripApplication.opt_is_extracting = true;
200 GLib.debug("scanning folder: %s", opt_path );
202 var strip = new Strip( opt_path );
205 strip.mysql = new Mysql.Database();
206 if (!strip.mysql.real_connect(
209 opt_db_pass == null ? "" : opt_db_pass, //passwd
211 3306, // not changable...?
215 stdout.printf("ERROR %u: Connection failed: %s\n",
216 strip.mysql.errno(), strip.mysql.error()
220 if (opt_file != null) {
221 strip.base_dir = opt_path;
222 strip.scan_file( GLib.Path.get_dirname(opt_file), GLib.Path.get_basename(opt_file));
226 strip.scan_dir(opt_path);
234 public class Strip : GLib.Object {
238 public string base_dir = "";
240 public Mysql.Database mysql;
244 uint64 used_space_before = 0;
245 uint64 used_space_after = 0;
248 public Strip(string base_dir)
250 this.base_dir = base_dir;
253 public void handle_part(GMime.Object parent, GMime.Object mime_obj)
255 if (mime_obj is GMime.Part) {
256 var p = (GMime.Part)mime_obj;
257 var ct = p.get_content_type();
258 var cd = p.get_content_disposition();
260 var sid = p.get_header("X-strip-id");
261 if (sid != null && sid.length > 0) {
262 this.update_attachment_db(p);
263 GLib.debug("Skip attachment replace - it's already been done");
267 if (cd == null || cd.get_disposition().down() != "attachment") {
270 if (ct.get_media_type() == "text") {
273 if (ct.to_string() == "application/pgp-encrypted") {
276 if (ct.to_string() == "application/pgp-keys") {
279 if (p.get_filename() == null) {
282 // print("got part %s\n", ct.to_string());
283 if (parent is GMime.Multipart) {
285 this.replace_attachment(((GMime.Multipart)parent), p);
293 if (mime_obj is GMime.Multipart) {
296 var mp = (GMime.Multipart)mime_obj;
297 //var ct = mp.get_content_type();
299 //print("got multi-part %s\n", ct.to_string());
300 for (var i = 0; i< mp.get_count(); i++) {
301 var mo = mp.get_part(i);
302 this.handle_part(mime_obj,mo);
304 // ((GMime.Multipart)mime_obj).foreach((sub_obj) => {
305 // Strip.handle_part(sub_obj);
313 if (mime_obj is GMime.MessagePart) {
314 var msg = ((GMime.MessagePart)mime_obj).get_message();
315 msg.foreach((subobj) => {
316 this.handle_part(msg,subobj);
319 //print("got message-part\n");
323 if (mime_obj is GMime.Message) {
324 var mp = ((GMime.Message) mime_obj).get_mime_part();
326 if (!(mp is GMime.Multipart)) {
327 GLib.debug("get mimepart does not return a Multipart?");
331 var mpc = ((GMime.Multipart)mp).get_count();
333 //GLib.debug("Message has %d parts", mpc);
334 for (var i =0 ; i < mpc; i++) {
335 //GLib.debug("Getting part %d", i);
336 var submime_obj = ((GMime.Multipart)mp).get_part(i);
337 this.handle_part(mp,submime_obj);
339 print("got message??\n");
343 print("got something else\n");
347 public void update_attachment_db(GMime.Part attachment)
349 // only called when we have an sid...
350 var sid = attachment.get_header("X-strip-id");
351 if (sid == null || sid.length < 1) {
352 GLib.debug("Strange - update attachment db called ?");
356 // initialize it with known data..
357 // that should wipe out dupes.
377 SELECT attachment_update(
378 %d, -- in_id INT(11),
379 '%s', -- in_mime_type varchar(255),
380 %d, -- in_mime_size int(11),
381 '%s', -- in_created DATETIME,
382 '%s' -- in_mailfort_sig varchar(64)
389 "", // this will be ignored..
390 0, // this will be ingored..
392 this.mysql_escape(this.active_message_x_mailfort_sig)
402 public void replace_attachment(GMime.Multipart parent, GMime.Part attachment)
404 var sid = attachment.get_header("X-strip-id");
405 if (sid != null && sid.length > 0) {
406 GLib.debug("Skip attachment replace - it's already been done");
410 var c = attachment.get_content_object();
412 var filename = attachment.get_filename().replace("/", "-");
413 var fn = GLib.Environment.get_tmp_dir() +
414 "/"+ this.active_name + "."+ filename;
416 var outfile = new GMime.StreamFile.for_path(fn, "w");
417 outfile.set_owner(true);
418 var file_size = (int) c.write_to_stream(outfile);
419 var chksum = this.md5_file(fn);
423 if (file_size == 0) {
425 GLib.debug("ERROR - file size of write to stream returned 0?");
433 var mime_type= attachment.get_content_type().to_string();
434 // at this point we have to do our database magic...
435 //filesize / name / date / checksum / mimetype -- into mailfort should be OK.
441 '%s', -- in_msgid VARCHAR(32),
442 '%s', -- in_checksum VARCHAR(64),
443 '%s', -- in_mime_filename varchar(255)
448 this.mysql_escape(this.active_message_exim_id),
450 this.mysql_escape( attachment.get_filename() ), // what is thsi is invalid?
454 var rs = mysql.use_result();
456 var row = rs.fetch_row();
460 //var create_dir = this.created_dir;
463 if (rs == null || rs.num_rows() < 1) {
464 GLib.debug("ERROR - CALL to attachment_init failed");
470 if (int.parse(file_id) < 1) {
471 GLib.debug("ERROR - CALL to attachment_init failed - returned 0?");
478 GLib.debug("fn = %s, m5=%s, id= %s", filename, mime_type, this.active_message_id);
481 SELECT attachment_update(
482 %d, -- in_id INT(11),
483 '%s', -- in_mime_type varchar(255),
484 '%s', -- in_created DATETIME,
485 '%s', -- in_mailfort_sig varchar(64)
490 this.mysql_escape(mime_type),
492 this.mysql_escape(this.active_message_x_mailfort_sig)
497 this.used_space_after += file_size;
501 if (StripApplication.opt_is_extracting) {
502 target_fn = StripApplication.opt_target_path + "/" + this.created_dir +"/"+ file_id + "-" + filename;
505 var stored = "/" + this.created_dir +"/"+ file_id + "-" + filename;
508 SELECT attachment_update_store(
509 %d, -- in_id INT(11),
510 '%s' -- in_store_filename varchar(255),
516 this.mysql_escape( stored)
520 var rep = new GMime.Part.with_type("text","html");
521 // we have to set up a redirect server - to redirect hpasite... to their internal service..
522 rep.set_filename(filename);
523 string txt = "<html><body>"+
524 "<a href=\"" + StripApplication.opt_replace_link + "/" +
525 file_id + "/" + this.created_dir + "/"+chksum+"/"+ GLib.Uri.escape_string( filename) +"\">" +
526 GLib.Uri.escape_string( filename) + // fixme needs html escaping...
530 rep.get_content_type().set_parameter("charset", "utf-8");
531 rep.set_header("X-strip-id", file_id);
532 rep.set_header("X-strip-content-name", filename);
533 rep.set_header("X-strip-path", this.created_dir + "/" + file_id + "-" + filename);
534 rep.set_header("X-strip-content-type", mime_type);
535 var stream = new GMime.StreamMem.with_buffer(txt.data);
536 var con = new GMime.DataWrapper.with_stream(stream,GMime.ContentEncoding.DEFAULT);
538 rep.set_content_object(con);
539 GLib.debug("Replacing Attachment with HTML");
540 parent.replace(parent.index_of(attachment), rep);
541 this.has_replaced = true;
543 if (StripApplication.opt_is_extracting && target_fn.length > 0) {
544 var dir = GLib.Path.get_dirname(target_fn);
545 if (!FileUtils.test (dir, FileTest.IS_DIR)) {
546 GLib.DirUtils.create_with_parents(dir, 0755);
548 GLib.debug("Creating file %s", target_fn);
549 if (!FileUtils.test (target_fn, FileTest.EXISTS)) {
550 Posix.link(fn, target_fn);
553 GLib.debug("Skipping extraction %s", target_fn);
561 public int query(string str)
563 if (StripApplication.opt_debug_sql) {
564 GLib.debug("SQL: %s\n", str);
567 var rc= this.mysql.query(str);
570 GLib.debug("ERROR %u: Query failed: %s\n", this.mysql.errno(), this.mysql.error());
575 public string mysql_escape(string str)
577 unichar[] value_escaped = new unichar[str.length * 2 + 1];
578 this.mysql.real_escape_string ((string) value_escaped, str, str.length);
579 return (string) value_escaped;
582 public string md5_file(string fn) {
583 Checksum checksum = new Checksum (ChecksumType.MD5);
585 FileStream stream = FileStream.open (fn, "rb");
589 while ((size = stream.read (fbuf)) > 0) {
590 checksum.update (fbuf, size);
593 unowned string digest = checksum.get_string ();
597 string active_path = "";
598 string active_name = "";
599 string active_message_id = "";
600 string active_message_x_mailfort_sig = "";
601 string active_message_exim_id = "";
602 bool has_replaced = false;
603 string created_date = ""; // should be YYYY-mm-dd
604 string created_dir = ""; // should be YYY/mm/dd
606 public void scan_file(string path, string name)
608 GLib.debug("Scan: %s/%s", path,name);
610 this.has_replaced = false;
611 this.active_path = path;
612 this.active_name = name;
613 this.active_message_id = "";
615 this.created_dir = this.active_path.substring(this.base_dir.length + 1 );
616 this.created_date = this.created_dir.replace("/", "-");
618 var fileinfo = File.new_for_path(path +"/" + name)
619 .query_info(GLib.FileAttribute.STANDARD_SIZE+","+GLib.FileAttribute.TIME_MODIFIED
620 ,GLib.FileQueryInfoFlags.NONE,null);
621 var file_size = (int) fileinfo.get_size();
622 var mod_time = fileinfo.get_modification_time();
624 this.used_space_before += file_size;
626 var stream = new GMime.StreamFs.for_path (path +"/" + name,Posix.O_RDONLY, 0);
627 //stream.set_owner(true);
628 var parser = new GMime.Parser.with_stream(stream);
629 var message = parser.construct_message();
631 if (message == null) {
632 GLib.debug("Could not parse file? %s/%s", path,name);
633 this.used_space_after += file_size;
638 // check : - is message over a year old?
639 // get various msg info..
640 this.active_message_id = message.get_message_id();
641 this.active_message_x_mailfort_sig = message.get_header("x-mailfort-sig");
642 var recvd = message.get_header("received");
643 this.active_message_exim_id = "";
644 if (recvd != null && recvd.length > 1) {
645 // GLib.debug("RECV: %s", recvd);
646 var lines = recvd.split("\t");
647 for (var i = 0; i < lines.length;i++) {
648 var bits = lines[i].strip().split(" ");
649 if (bits[0] == "id") {
650 this.active_message_exim_id = bits[1];
655 GLib.debug("Message DATA:\n mid: %s\nmailfort: %s \nexim_id: %s",
656 this.active_message_id,
657 this.active_message_x_mailfort_sig,
658 this.active_message_exim_id
664 var mp = message.get_mime_part();
666 if (!(mp is GMime.Multipart)) {
667 GLib.debug("get mimepart does not return a Multipart?");
668 this.used_space_after += file_size;
672 var mpc = ((GMime.Multipart)mp).get_count();
674 //GLib.debug("Message has %d parts", mpc);
675 for (var i =0 ; i < mpc; i++) {
676 //GLib.debug("Getting part %d", i);
677 var mime_obj = ((GMime.Multipart)mp).get_part(i);
678 this.handle_part(mp,mime_obj);
683 // stream.set_owner(false);
685 stream = null;//.close();
688 if (!this.has_replaced) {
689 this.used_space_after += file_size;
690 GLib.debug("skpping write file - no replacement occured");
694 GMime.Stream outstream = new GMime.StreamNull();
695 if (StripApplication.opt_is_replacing) {
697 tmpfile = GLib.Environment.get_tmp_dir() +"/" + name;
698 outstream = new GMime.StreamFile.for_path (tmpfile,"w");
699 ((GMime.StreamFile)outstream).set_owner(true);
701 if (StripApplication.opt_dump) {
702 outstream = new GMime.StreamMem();
705 file_size = (int) message.write_to_stream(outstream);
706 if (StripApplication.opt_is_replacing) {
707 ((GMime.StreamFile)outstream).set_owner(false);
709 if (StripApplication.opt_dump) {
710 var ua = ((GMime.StreamMem)outstream).get_byte_array().data;
711 print("%s\n", (string) ua);
716 GLib.debug("finished writing output %d", file_size);
721 this.used_space_after += file_size;
724 if (StripApplication.opt_is_replacing) {
725 Posix.unlink(path +"/" + name);
726 GLib.debug("copy tmp file %s to %s" , tmpfile, path +"/" + name);
727 Posix.link(tmpfile, path +"/" + name);
728 Posix.unlink(tmpfile);
729 var nf = File.new_for_path(path +"/" + name);
730 var newfileinfo = nf.query_info(GLib.FileAttribute.TIME_MODIFIED,GLib.FileQueryInfoFlags.NONE,null);
731 newfileinfo.set_modification_time(mod_time);
732 nf.set_attributes_from_info(newfileinfo,FileQueryInfoFlags.NONE);
736 if (StripApplication.opt_limit > -1 && this.processed >= StripApplication.opt_limit) {
737 GLib.debug("Reached replacement limit");
747 public void scan_dir(string path)
749 var f = File.new_for_path(path);
750 FileEnumerator file_enum;
751 var cancellable = new Cancellable ();
753 file_enum = f.enumerate_children(
754 FileAttribute.STANDARD_DISPLAY_NAME + "," + FileAttribute.STANDARD_TYPE,
755 FileQueryInfoFlags.NOFOLLOW_SYMLINKS, // FileQueryInfoFlags.NONE,
759 GLib.debug("Got error scanning dir? %s", e.message);
760 // FIXME - show error..
765 while (cancellable.is_cancelled () == false ) {
767 next_file = file_enum.next_file (cancellable);
769 GLib.debug("error getting next file? %s", e.message);
773 if (next_file == null) {
778 if (next_file.get_file_type() != FileType.DIRECTORY) {
780 if (next_file.get_display_name()[0] == ',') {
784 this.scan_file(path , next_file.get_display_name());
785 this.report_state("After scanning %s/%s".printf(path , next_file.get_display_name()));
790 //stdout.printf("Monitor.monitor: got file %s : type :%u\n",
791 // next_file.get_display_name(), next_file.get_file_type());
794 if (next_file.get_is_symlink()) {
799 var ds = next_file.get_display_name();
804 if (ds == "attachments") {
809 var sp = path+"/"+next_file.get_display_name();
811 //print("got a file : " + sp);
822 void report_state(string msg)
824 // Saved: 2G Original 10G : 20%
825 GLib.debug("Saved : %s (%.1f%%) | Original %s | %s",
826 GLib.format_size(this.used_space_before - this.used_space_after),
827 100f * ((1f * (this.used_space_before - this.used_space_after)) / (this.used_space_before * 1f)),
828 GLib.format_size(this.used_space_before),