4 a) our mailfort email database
5 point it at the top directory, containing YEAR/MONTH/DAY.... directories.
6 scan each file (over a year old...)
7 extract out the attachment, and replace with HTML
8 DATABASE? - mysql or sqlite? -
9 filesize / name / date / checksum / mimetype -- into mailfort should be OK.
10 b) the imap user emails
11 loop through user's directories
12 check age of email .. over 1 year..
13 ?? how to prevent 'repeat' scanning of emails?
14 ??? hidden '.' files containing last scan date?
16 check if file exists in our DB.. - replace the link...
17 otherwise generate a file. + add to DB...
22 -> URL -> redirect to correct server
25 More notes on our Mailfort DB sync:
26 * some of these attachments are already in the database...
27 - so we need to update the DB..
28 - probably worth putting the code in a stored procedure..
31 * first scan (and extract)
32 * rescan (as I messed up the first time - fix the DB...)
33 * email scan - attachments might not have related messages.
36 - {id} attachment_init(
41 // creates or returns id (can look for existing messages?)
42 // can do a merge?? - copy 'old' record data into 'new'.... "prefer checksummed"
49 {created} // message date..
54 attachment_update_store(
62 // valac --pkg gmime --vapi
65 // http://www.fromdual.com/mysql-vala-program-example << check mysql if this does not work.
67 valac -g --vapidir=. --thread strip.vala --vapidir=../vapi \
68 --pkg glib-2.0 --pkg mysql --pkg gio-2.0 --pkg posix --pkg gmime-2.6 \
69 --Xcc=-lmysqlclient -v \
// Command-line front end for the mail stripper.
// Declares the option variables and the GLib option table, parses and validates the
// arguments in the constructor, and in main() connects to MySQL and starts a scan of
// either a single file or a directory tree through the Strip class.
73 public class StripApplication : GLib.Application {
// Option values. They are static so the option table below and the static main()
// can refer to them directly.
75 public static string? opt_path = null;
76 public static string? opt_file = null;
77 public static string? opt_target_path = null;
// NOTE(review): the default here is "127.0.0.1", while the --host help text says "localhost".
78 public static string? opt_db_host = "127.0.0.1";
79 public static string? opt_db_name = null;
80 public static string? opt_db_user = null;
81 public static string? opt_db_pass = null;
// -1 means "no limit" (scan_file only compares against it when it is > -1).
83 public static int opt_limit = -1;
85 public static bool opt_is_extracting = false;
86 public static bool opt_is_replacing = false;
87 public static bool opt_scan_imap = false;
88 public static bool opt_scan_mailfort = false;
89 public static bool opt_dump = false;
90 public static bool opt_debug = false;
92 public static bool opt_debug_sql = false;
93 public static string? opt_replace_link = null;
// Table handed to OptionContext.add_main_entries(); each entry binds a long option
// name to one of the static variables above.
96 public const GLib.OptionEntry[] options = {
98 { "debug", 0, 0, OptionArg.NONE, ref opt_debug, "show debug messages for components", null },
99 { "debug-sql", 0, 0, OptionArg.NONE, ref opt_debug_sql, "debug the SQL statements", null },
101 { "path", 0, 0, OptionArg.STRING, ref opt_path, "Directory where email to be parsed is", null },
102 { "file", 0, 0, OptionArg.STRING, ref opt_file, "A specific file to be parsed", null },
104 { "target-path", 0, 0, OptionArg.STRING, ref opt_target_path, "Directory where attachments are to be put", null },
106 { "link", 0, 0, OptionArg.STRING, ref opt_replace_link, "url for the replement link: eg. http://www.mysite.com/xxxx/%s", null },
108 { "host", 0, 0, OptionArg.STRING, ref opt_db_host, "Mysql host (default localhost)", null },
109 { "name", 0, 0, OptionArg.STRING, ref opt_db_name, "Mysql database name REQUIRED", null },
110 { "user", 0, 0, OptionArg.STRING, ref opt_db_user, "Mysql database user REQUIRED", null },
111 { "pass", 0, 0, OptionArg.STRING, ref opt_db_pass, "Mysql database password (default empty)", null },
113 { "extract", 0, 0, OptionArg.NONE, ref opt_is_extracting, "Should attachments be extracted (default NO)", null },
114 { "replace", 0, 0, OptionArg.NONE, ref opt_is_replacing, "Should attachments be replaced (default NO)", null },
115 { "dump", 0, 0, OptionArg.NONE, ref opt_dump, "Print the replaced mail contents to stdout", null },
117 { "limit", 0, 0, OptionArg.INT, ref opt_limit, "stop after X number of messages with attachments have been processed", null },
119 { "scan-imap", 0, 0, OptionArg.NONE, ref opt_scan_imap, "scan an imap tree", null },
120 { "scan-mailfort", 0, 0, OptionArg.NONE, ref opt_scan_mailfort, "scan a mailfort tree", null },
// Constructor: parses args with a GLib.OptionContext and validates the result.
// Every failed check prints a message plus the generated help text to stdout and
// terminates the process with Posix.EXIT_FAILURE.
123 public StripApplication( string[] args )
126 application_id: "org.roojs.mailstripper",
127 flags: ApplicationFlags.FLAGS_NONE
131 var opt_context = new GLib.OptionContext ("Mail Stripper");
135 opt_context.set_help_enabled (true);
136 opt_context.add_main_entries (options, null);
137 opt_context.parse ( ref args);
138 //opt_detach = !optx_no_detach;
142 // options that have to be set.. bee or hive... (or stop all)
// Exactly one of --scan-mailfort / --scan-imap must be given: both unset and
// both set are rejected.
143 if ((!opt_scan_mailfort && !opt_scan_imap) || (opt_scan_mailfort && opt_scan_imap)) {
144 stdout.printf ("You must specify the type of directory tree to scan - either imap or mailfort\n%s",
145 opt_context.get_help(true, null));
146 GLib.Process.exit(Posix.EXIT_FAILURE);
// Database name and user are mandatory; the password may be omitted.
149 if ((opt_db_name == null || opt_db_name.length < 1 || opt_db_user == null || opt_db_user.length < 1)) {
150 stdout.printf ("You must specify the database name / user \n%s",
151 opt_context.get_help(true, null));
152 GLib.Process.exit(Posix.EXIT_FAILURE);
154 if ((opt_path == null || opt_path.length < 1) ) {
155 stdout.printf ("You must specify the scan start path\n%s",
156 opt_context.get_help(true, null));
157 GLib.Process.exit(Posix.EXIT_FAILURE);
// NOTE(review): as written, --link is required even when neither --replace nor
// --extract is requested — confirm that is intended.
159 if (opt_replace_link == null || (opt_replace_link.length < 1)) {
160 stdout.printf ("You must specify the link to use in the replacement \n%s",
161 opt_context.get_help(true, null));
162 GLib.Process.exit(Posix.EXIT_FAILURE);
// A target directory is only needed when attachments are extracted or replaced.
164 if ((opt_is_replacing || opt_is_extracting ) && (opt_target_path == null || opt_target_path.length < 1)) {
165 stdout.printf ("You must specify a target path to put attachments\n%s",
166 opt_context.get_help(true, null));
167 GLib.Process.exit(Posix.EXIT_FAILURE);
171 } catch (GLib.OptionError e) {
172 stdout.printf ("error: %s\n", e.message);
173 stdout.printf ("Run '%s --help' to see a full list of available command line options.\n%s",
174 args[0], opt_context.get_help(true, null));
175 GLib.Process.exit(Posix.EXIT_FAILURE);
// Program entry point: builds the application (which parses the options), sets up
// logging, connects to MySQL and runs the scan.
179 public static int main(string[] args)
182 var application = new StripApplication( args);
// ERROR and CRITICAL log messages abort the process.
184 GLib.Log.set_always_fatal(LogLevelFlags.LEVEL_ERROR | LogLevelFlags.LEVEL_CRITICAL);
// With --debug or --debug-sql a handler is installed for DEBUG, WARNING and INFO messages.
186 if (opt_debug || opt_debug_sql) {
187 GLib.Log.set_handler(null,
188 GLib.LogLevelFlags.LEVEL_DEBUG | GLib.LogLevelFlags.LEVEL_WARNING | GLib.LogLevelFlags.LEVEL_INFO,
// --replace implies --extract.
196 if (StripApplication.opt_is_replacing) {
197 StripApplication.opt_is_extracting = true;
200 GLib.debug("scanning folder: %s", opt_path );
202 var strip = new Strip( opt_path );
// Open the MySQL connection used by Strip; a missing password is sent as "" and
// the port is fixed at 3306.
205 strip.mysql = new Mysql.Database();
206 if (!strip.mysql.real_connect(
209 opt_db_pass == null ? "" : opt_db_pass, //passwd
211 3306, // not changable...?
215 stdout.printf("ERROR %u: Connection failed: %s\n",
216 strip.mysql.errno(), strip.mysql.error()
// --file processes a single message (split into directory and base name) with
// opt_path still used as the base directory.
// NOTE(review): scan_dir() below is presumably the alternative branch — confirm.
221 if (opt_file != null) {
222 strip.base_dir = opt_path;
223 strip.scan_file( GLib.Path.get_dirname(opt_file), GLib.Path.get_basename(opt_file));
227 strip.scan_dir(opt_path);
// Scanner that walks mail files, stores attachment metadata in MySQL and replaces
// attachments with an HTML link part.
235 public class Strip : GLib.Object {
// Root of the tree being scanned; scan_file() strips this prefix from a file's
// directory to build created_dir.
239 public string base_dir = "";
// MySQL connection; assigned by StripApplication.main() before any scan starts.
241 public Mysql.Database mysql;
// Running byte totals, used by report_state(): size of the scanned files before
// processing, and the size that remains afterwards.
245 uint64 used_space_before = 0;
246 uint64 used_space_after = 0;
// Creates a scanner rooted at base_dir.
249 public Strip(string base_dir)
251 this.base_dir = base_dir;
// Walks one node of a MIME tree and dispatches on its runtime type.
//
// parent   - the object that contains mime_obj; replace_attachment() is only called
//            when this is a GMime.Multipart.
// mime_obj - the node to inspect: a Part, Multipart, MessagePart or Message.
//
// NOTE(review): the bodies of most of the guard conditions below are not visible in
// this listing; they presumably return early — confirm against the full file.
254 public void handle_part(GMime.Object parent, GMime.Object mime_obj)
// Leaf part: decide whether it is an attachment that should be replaced.
256 if (mime_obj is GMime.Part) {
257 var p = (GMime.Part)mime_obj;
258 var ct = p.get_content_type();
259 var cd = p.get_content_disposition();
// A part carrying X-strip-id was replaced by a previous run; only refresh its
// database record.
261 var sid = p.get_header("X-strip-id");
262 if (sid != null && sid.length > 0) {
263 this.update_attachment_db(p);
264 GLib.debug("Skip attachment replace - it's already been done");
// Filters: a disposition other than "attachment", text/* parts, PGP
// encrypted/key parts, and parts without a filename.
268 if (cd == null || cd.get_disposition().down() != "attachment") {
271 if (ct.get_media_type() == "text") {
274 if (ct.to_string() == "application/pgp-encrypted") {
277 if (ct.to_string() == "application/pgp-keys") {
280 if (p.get_filename() == null) {
283 // print("got part %s\n", ct.to_string());
// Replacement needs a multipart container so the part can be swapped in place.
284 if (parent is GMime.Multipart) {
286 this.replace_attachment(((GMime.Multipart)parent), p);
// Container: recurse into every child, passing this multipart as the parent.
294 if (mime_obj is GMime.Multipart) {
297 var mp = (GMime.Multipart)mime_obj;
298 //var ct = mp.get_content_type();
300 //print("got multi-part %s\n", ct.to_string());
301 for (var i = 0; i< mp.get_count(); i++) {
302 var mo = mp.get_part(i);
303 this.handle_part(mime_obj,mo);
305 // ((GMime.Multipart)mime_obj).foreach((sub_obj) => {
306 // Strip.handle_part(sub_obj);
// Embedded message (message/rfc822): visit its objects with the embedded message
// as the parent.
314 if (mime_obj is GMime.MessagePart) {
315 var msg = ((GMime.MessagePart)mime_obj).get_message();
316 msg.foreach((subobj) => {
317 this.handle_part(msg,subobj);
320 //print("got message-part\n");
// A bare Message: process the children of its top-level MIME part when that part
// is a multipart.
324 if (mime_obj is GMime.Message) {
325 var mp = ((GMime.Message) mime_obj).get_mime_part();
327 if (!(mp is GMime.Multipart)) {
328 //GLib.debug("get mimepart does not return a Multipart?");
332 var mpc = ((GMime.Multipart)mp).get_count();
334 //GLib.debug("Message has %d parts", mpc);
335 for (var i =0 ; i < mpc; i++) {
336 //GLib.debug("Getting part %d", i);
337 var submime_obj = ((GMime.Multipart)mp).get_part(i);
338 this.handle_part(mp,submime_obj);
340 print("got message??\n");
344 print("got something else\n");
// Refreshes the database record of an attachment that was already replaced by an
// earlier run, i.e. a part that carries an X-strip-id header.
//
// Reads filesize, checksum and mime_filename back from the Attachment table by id,
// then passes the current message's exim id, the checksum and the filename to a
// further statement, and finally calls the attachment_update() SQL function with an
// empty MIME type and the message's mailfort signature.
//
// NOTE(review): the id placeholders are formatted with %d; the arguments supplied to
// printf() and the SQL text of the intermediate statement are not visible in this
// listing — confirm against the full file.
348 public void update_attachment_db(GMime.Part attachment)
350 // only called when we have an sid...
351 var sid = attachment.get_header("X-strip-id");
352 if (sid == null || sid.length < 1) {
353 GLib.debug("Strange - update attachment db called ?");
357 // initialize it with known data..
358 // that should wipe out dupes.
359 var filesize = this.query("SELECT filesize FROM Attachment WHERE id = %d".printf(
361 if (int.parse(filesize) < 1) {
362 GLib.debug("Could not get filesize from id :%s = %s", sid,filesize);
367 var chksum = this.query("SELECT checksum FROM Attachment WHERE id = %d".printf(
370 var mime_filename = this.query("SELECT mime_filename FROM Attachment WHERE id = %d".printf(
380 this.mysql_escape(this.active_message_exim_id),
381 this.mysql_escape(chksum),
382 this.mysql_escape(mime_filename),
386 SELECT attachment_update(
387 %d, -- in_id INT(11),
388 '%s', -- in_mime_type varchar(255),
389 '%s', -- in_created DATETIME,
390 '%s' -- in_mailfort_sig varchar(64)
394 "", // this will be ignored..
396 this.mysql_escape(this.active_message_x_mailfort_sig)
400 this.mysql.store_result();
// Replaces one attachment part with a small text/html part that links to the
// extracted file.
//
// parent     - multipart that contains the attachment; the replacement is swapped in
//              at the attachment's index.
// attachment - the part to extract and replace.
//
// Visible steps: write the decoded content to a temp file, checksum it, register it
// through SQL (the result is used as file_id, then attachment_update and
// attachment_update_store), build the HTML replacement part with X-strip-* headers,
// swap it into parent, and when extracting hard-link the temp file into the target
// tree.
407 public void replace_attachment(GMime.Multipart parent, GMime.Part attachment)
// Parts that already carry X-strip-id were handled by a previous run.
409 var sid = attachment.get_header("X-strip-id");
410 if (sid != null && sid.length > 0) {
411 GLib.debug("Skip attachment replace - it's already been done");
415 var c = attachment.get_content_object();
// "/" is replaced so the attachment name cannot introduce extra path components;
// the temp file name is prefixed with the mail file's own name.
417 var filename = attachment.get_filename().replace("/", "-");
418 var fn = GLib.Environment.get_tmp_dir() +
419 "/"+ this.active_name + "."+ filename;
421 var outfile = new GMime.StreamFile.for_path(fn, "w");
422 outfile.set_owner(true);
// NOTE(review): the stream result is narrowed to int and only 0 is tested below;
// a negative (error) return would not be caught by that test — confirm.
423 var file_size = (int) c.write_to_stream(outfile);
424 var chksum = this.md5_file(fn);
428 if (file_size == 0) {
430 GLib.debug("ERROR - file size of write to stream returned 0?");
438 var mime_type= attachment.get_content_type().to_string();
439 // at this point we have to do our database magic...
440 //filesize / name / date / checksum / mimetype -- into mailfort should be OK.
442 var file_id = this.query("""
446 '%s', -- in_msgid VARCHAR(32),
447 '%s', -- in_checksum VARCHAR(64),
448 '%s', -- in_mime_filename varchar(255)
453 this.mysql_escape(this.active_message_exim_id),
455 this.mysql_escape( attachment.get_filename() ), // what is thsi is invalid?
460 if (file_id.length < 1) {
461 GLib.debug("ERROR - CALL to attachment_init failed");
467 if (int.parse(file_id) < 1) {
468 GLib.debug("ERROR - CALL to attachment_init failed - returned 0?");
// NOTE(review): the "m5=" label is fed mime_type (not chksum) and "id=" is fed
// active_message_id — the labels do not match the values printed.
475 GLib.debug("fn = %s, m5=%s, id= %s", filename, mime_type, this.active_message_id);
478 SELECT attachment_update(
479 %d, -- in_id INT(11),
480 '%s', -- in_mime_type varchar(255),
481 '%s', -- in_created DATETIME,
482 '%s' -- in_mailfort_sig varchar(64)
487 this.mysql_escape(mime_type),
489 this.mysql_escape(this.active_message_x_mailfort_sig)
491 this.mysql.store_result();
494 this.used_space_after += file_size;
// Extracted files are stored as <target>/<created_dir>/<file_id>-<filename>.
498 if (StripApplication.opt_is_extracting) {
499 target_fn = StripApplication.opt_target_path + "/" + this.created_dir +"/"+ file_id + "-" + filename;
502 var stored = "/" + this.created_dir +"/"+ file_id + "-" + filename;
505 SELECT attachment_update_store(
506 %d, -- in_id INT(11),
507 '%s' -- in_store_filename varchar(255),
513 this.mysql_escape( stored)
// Build the replacement part: an HTML document whose link is
// <link>/<file_id>/<created_dir>/<checksum>/<escaped filename>.
516 var rep = new GMime.Part.with_type("text","html");
517 // we have to set up a redirect server - to redirect hpasite... to their internal service..
518 rep.set_filename(filename);
519 string txt = "<html><body>"+
520 "<a href=\"" + StripApplication.opt_replace_link + "/" +
521 file_id + "/" + this.created_dir + "/"+chksum+"/"+ GLib.Uri.escape_string( filename) +"\">" +
522 GLib.Uri.escape_string( filename) + // fixme needs html escaping...
526 rep.get_content_type().set_parameter("charset", "utf-8");
// X-strip-* headers record the original attachment; X-strip-id is what later runs
// test to detect an already-replaced part.
527 rep.set_header("X-strip-id", file_id);
528 rep.set_header("X-strip-content-name", filename);
529 rep.set_header("X-strip-path", this.created_dir + "/" + file_id + "-" + filename);
530 rep.set_header("X-strip-content-type", mime_type);
531 var stream = new GMime.StreamMem.with_buffer(txt.data);
532 var con = new GMime.DataWrapper.with_stream(stream,GMime.ContentEncoding.DEFAULT);
534 rep.set_content_object(con);
535 GLib.debug("Replacing Attachment with HTML");
536 parent.replace(parent.index_of(attachment), rep);
// Tells scan_file() that the message changed and must be written out.
537 this.has_replaced = true;
// Publish the extracted file into the target tree, creating the directory first.
539 if (StripApplication.opt_is_extracting && target_fn.length > 0) {
540 var dir = GLib.Path.get_dirname(target_fn);
541 if (!FileUtils.test (dir, FileTest.IS_DIR)) {
542 GLib.DirUtils.create_with_parents(dir, 0755);
544 GLib.debug("Creating file %s", target_fn);
// NOTE(review): Posix.link() creates a hard link and its return value is not
// checked here; hard links cannot cross filesystems, so this silently does nothing
// when the temp dir and the target path are on different mounts — confirm.
545 if (!FileUtils.test (target_fn, FileTest.EXISTS)) {
546 Posix.link(fn, target_fn);
549 GLib.debug("Skipping extraction %s", target_fn);
// Runs the SQL in str through real_query() with need_return = true and returns the
// string that real_query() produces.
556 public string query(string str)
558 return this.real_query(true, str);
// Runs the SQL in str through real_query() with need_return = false and returns the
// string that real_query() produces.
560 public string execute(string str)
562 return this.real_query(false, str);
// Sends one SQL statement over the shared MySQL connection.
//
// need_return - true when called through query(), false through execute().
//               NOTE(review): how this flag changes result handling is not visible
//               in this listing — confirm against the full file.
// str         - the complete SQL text; callers are responsible for escaping any
//               values with mysql_escape() before formatting them in.
//
// Failures are reported through GLib.debug() only.
564 public string real_query(bool need_return, string str)
// Logs the connection's current error state before the statement is sent.
566 GLib.debug("Before Query : %u : %s\n", this.mysql.errno(), this.mysql.error());
569 if (StripApplication.opt_debug_sql) {
570 GLib.debug("SQL: %s\n", str);
575 var rc= this.mysql.query(str);
578 GLib.debug("ERROR %u: Query failed: %s\n", this.mysql.errno(), this.mysql.error());
// use_result() fetches rows from the server one at a time as fetch_row() is called.
583 var rs = mysql.use_result();
588 while( (row = rs.fetch_row()) != null) {
597 GLib.debug("ERROR : no rows returned");
601 GLib.debug("got %s", ret);
// Escapes str with the connection's real_escape_string() so it can be embedded
// inside a quoted SQL literal, and returns the escaped copy.
//
// The output buffer has str.length * 2 + 1 elements, which matches the worst case
// documented for mysql_real_escape_string (every input byte escaped, plus the
// terminator).
// NOTE(review): the buffer is declared as unichar[] rather than a byte array and is
// then cast to string; it is larger than the byte count requires — confirm intent.
607 public string mysql_escape(string str)
609 unichar[] value_escaped = new unichar[str.length * 2 + 1];
610 this.mysql.real_escape_string ((string) value_escaped, str, str.length);
611 return (string) value_escaped;
// Computes the MD5 checksum of the file at path fn by feeding it to GLib.Checksum in
// buffer-sized chunks.
// NOTE(review): no null check on the result of FileStream.open() is visible here;
// confirm what happens when fn cannot be opened.
614 public string md5_file(string fn) {
615 Checksum checksum = new Checksum (ChecksumType.MD5);
617 FileStream stream = FileStream.open (fn, "rb");
// read() returns the number of bytes placed in fbuf; the loop ends at EOF.
621 while ((size = stream.read (fbuf)) > 0) {
622 checksum.update (fbuf, size);
// get_string() yields the digest as text; it is borrowed from the Checksum object
// (hence unowned).
625 unowned string digest = checksum.get_string ();
// Per-message state. scan_file() fills these in for the file being processed, and
// update_attachment_db() / replace_attachment() read them.
// Directory and file name of the mail file currently being scanned.
629 string active_path = "";
630 string active_name = "";
// Message-ID, x-mailfort-sig header and the id token taken from the Received header.
631 string active_message_id = "";
632 string active_message_x_mailfort_sig = "";
633 string active_message_exim_id = "";
// Set by replace_attachment() when at least one part of the current message was swapped.
634 bool has_replaced = false;
635 string created_date = ""; // should be YYYY-mm-dd
636 string created_dir = ""; // should be YYYY/mm/dd
// Processes a single mail file.
//
// path - directory that contains the file (expected to lie below base_dir).
// name - file name within path.
//
// Parses the file with GMime, records message identifiers, walks the top-level
// multipart through handle_part(), and — only if a part was replaced — serialises
// the modified message. With --replace the result is written to a temp file and
// swapped in for the original, keeping the original modification time; with --dump
// it is printed to stdout.
638 public void scan_file(string path, string name)
640 GLib.debug("Scan: %s/%s", path,name);
642 this.has_replaced = false;
643 this.active_path = path;
644 this.active_name = name;
645 this.active_message_id = "";
// created_dir is path with the "<base_dir>/" prefix removed; created_date is the
// same text with "/" turned into "-".
// NOTE(review): this assumes path is strictly longer than base_dir — confirm for
// files directly inside base_dir.
647 this.created_dir = this.active_path.substring(this.base_dir.length + 1 );
648 this.created_date = this.created_dir.replace("/", "-");
// Size feeds the space statistics; the modification time is restored after a replace.
650 var fileinfo = File.new_for_path(path +"/" + name)
651 .query_info(GLib.FileAttribute.STANDARD_SIZE+","+GLib.FileAttribute.TIME_MODIFIED
652 ,GLib.FileQueryInfoFlags.NONE,null);
// NOTE(review): the 64-bit size is narrowed to int here.
653 var file_size = (int) fileinfo.get_size();
654 var mod_time = fileinfo.get_modification_time();
656 this.used_space_before += file_size;
658 var stream = new GMime.StreamFs.for_path (path +"/" + name,Posix.O_RDONLY, 0);
659 //stream.set_owner(true);
660 var parser = new GMime.Parser.with_stream(stream);
661 var message = parser.construct_message();
// Unparseable files are counted as unchanged in the "after" total.
663 if (message == null) {
664 GLib.debug("Could not parse file? %s/%s", path,name);
665 this.used_space_after += file_size;
670 // check : - is message over a year old?
671 // get various msg info..
672 this.active_message_id = message.get_message_id();
673 this.active_message_x_mailfort_sig = message.get_header("x-mailfort-sig");
// Find the id in the Received header: split on tabs, then on spaces, and take the
// token that follows a leading "id".
// NOTE(review): bits[1] is read without a visible length check — confirm a line
// consisting only of "id" cannot occur.
674 var recvd = message.get_header("received");
675 this.active_message_exim_id = "";
676 if (recvd != null && recvd.length > 1) {
677 // GLib.debug("RECV: %s", recvd);
678 var lines = recvd.split("\t");
679 for (var i = 0; i < lines.length;i++) {
680 var bits = lines[i].strip().split(" ");
681 if (bits[0] == "id") {
682 this.active_message_exim_id = bits[1];
687 GLib.debug("Message DATA:\n mid: %s\nmailfort: %s \nexim_id: %s",
688 this.active_message_id,
689 this.active_message_x_mailfort_sig,
690 this.active_message_exim_id
// Only messages whose top-level part is a multipart are processed further.
696 var mp = message.get_mime_part();
698 if (!(mp is GMime.Multipart)) {
699 //GLib.debug("get mimepart does not return a Multipart?");
700 this.used_space_after += file_size;
704 var mpc = ((GMime.Multipart)mp).get_count();
706 //GLib.debug("Message has %d parts", mpc);
707 for (var i =0 ; i < mpc; i++) {
708 //GLib.debug("Getting part %d", i);
709 var mime_obj = ((GMime.Multipart)mp).get_part(i);
710 this.handle_part(mp,mime_obj);
715 // stream.set_owner(false);
// Drop the reference to the input stream before the file is rewritten.
717 stream = null;//.close();
// Nothing replaced: the file stays as it is and counts at its full size.
720 if (!this.has_replaced) {
721 this.used_space_after += file_size;
722 GLib.debug("skpping write file - no replacement occured");
// Choose where the modified message is serialised: a null stream by default, a
// temp file named after the mail file when replacing, a memory buffer when dumping.
726 GMime.Stream outstream = new GMime.StreamNull();
727 if (StripApplication.opt_is_replacing) {
729 tmpfile = GLib.Environment.get_tmp_dir() +"/" + name;
730 outstream = new GMime.StreamFile.for_path (tmpfile,"w");
731 ((GMime.StreamFile)outstream).set_owner(true);
733 if (StripApplication.opt_dump) {
734 outstream = new GMime.StreamMem();
// file_size now holds the size of the rewritten message.
737 file_size = (int) message.write_to_stream(outstream);
738 if (StripApplication.opt_is_replacing) {
739 ((GMime.StreamFile)outstream).set_owner(false);
741 if (StripApplication.opt_dump) {
742 var ua = ((GMime.StreamMem)outstream).get_byte_array().data;
743 print("%s\n", (string) ua);
748 GLib.debug("finished writing output %d", file_size);
753 this.used_space_after += file_size;
// Swap the rewritten message into place, then restore the original mtime.
// NOTE(review): the original is unlinked before the hard link is created and no
// return value is checked; if Posix.link() fails (for example temp dir on another
// filesystem) the message is gone and the temp copy is removed as well — confirm.
756 if (StripApplication.opt_is_replacing) {
757 Posix.unlink(path +"/" + name);
758 GLib.debug("copy tmp file %s to %s" , tmpfile, path +"/" + name);
759 Posix.link(tmpfile, path +"/" + name);
760 Posix.unlink(tmpfile);
761 var nf = File.new_for_path(path +"/" + name);
762 var newfileinfo = nf.query_info(GLib.FileAttribute.TIME_MODIFIED,GLib.FileQueryInfoFlags.NONE,null);
763 newfileinfo.set_modification_time(mod_time);
764 nf.set_attributes_from_info(newfileinfo,FileQueryInfoFlags.NONE);
// --limit handling; opt_limit of -1 disables the check.
// NOTE(review): the declaration and update of this.processed are not visible in
// this listing — confirm.
768 if (StripApplication.opt_limit > -1 && this.processed >= StripApplication.opt_limit) {
769 GLib.debug("Reached replacement limit");
// Enumerates the children of the directory at path. Regular entries are handed to
// scan_file(); directory entries are inspected for recursion.
// Enumeration errors are reported through GLib.debug().
//
// NOTE(review): entry names are taken from get_display_name(), which is meant for
// display and may differ from the on-disk name for non-UTF-8 file names — confirm.
779 public void scan_dir(string path)
781 var f = File.new_for_path(path);
782 FileEnumerator file_enum;
783 var cancellable = new Cancellable ();
785 file_enum = f.enumerate_children(
786 FileAttribute.STANDARD_DISPLAY_NAME + "," + FileAttribute.STANDARD_TYPE,
787 FileQueryInfoFlags.NOFOLLOW_SYMLINKS, // FileQueryInfoFlags.NONE,
791 GLib.debug("Got error scanning dir? %s", e.message);
792 // FIXME - show error..
797 while (cancellable.is_cancelled () == false ) {
799 next_file = file_enum.next_file (cancellable);
801 GLib.debug("error getting next file? %s", e.message);
// next_file() returns null once the directory is exhausted.
805 if (next_file == null) {
// Non-directory entry: names starting with ',' get special treatment (the body is
// not visible in this listing); others are scanned, and progress is reported when
// the file was modified.
810 if (next_file.get_file_type() != FileType.DIRECTORY) {
812 if (next_file.get_display_name()[0] == ',') {
816 this.scan_file(path , next_file.get_display_name());
817 if(this.has_replaced) {
818 this.report_state("After scanning %s/%s".printf(path , next_file.get_display_name()));
824 //stdout.printf("Monitor.monitor: got file %s : type :%u\n",
825 // next_file.get_display_name(), next_file.get_file_type());
// NOTE(review): only display-name and type were requested from the enumerator;
// get_is_symlink() reads standard::is-symlink, which is not in that attribute
// list — confirm this check can ever be true.
828 if (next_file.get_is_symlink()) {
833 var ds = next_file.get_display_name();
// A directory literally named "attachments" is singled out here.
838 if (ds == "attachments") {
// Full path of the sub-directory.
843 var sp = path+"/"+next_file.get_display_name();
845 //print("got a file : " + sp);
856 void report_state(string msg)
858 // Saved: 2G Original 10G : 20%
859 GLib.debug("Saved : %s (%.1f%%) | Original %s | %s",
860 GLib.format_size(this.used_space_before - this.used_space_after),
861 100f * ((1f * (this.used_space_before - this.used_space_after)) / (this.used_space_before * 1f)),
862 GLib.format_size(this.used_space_before),