4 a) our mailfort email database
5 point it at the top directory, containing YEAR/MONTH/DAY.... directories.
6 scan each file (over a year old...)
7 extract out the attachment, and replace with HTML
8 DATABASE? - mysql or sqlite? -
9 filesize / name / date / checksum / mimetype -- into mailfort should be OK.
10 b) the imap user emails
11 loop through user's directories
12 check age of email .. over 1 years..
13 ?? how to prevent 'repeat' scanning of emails?
14 ??? hidden '.' files containing last scan date?
16 check if file exists in our DB.. - replace the link...
17 otherwise generate a file. + add to DB...
22 -> URL -> redirect to correct server
25 More notes on our Mailfort DB sync:
26 * some of these attachments are already in the database...
27 - so we need to update the DB..
28 - probably worth putting the code in a stored procedure..
31 * first scan (and extract)
32 * rescan (as I messed up the first time - fix the DB...)
33 * email scan - attachments might not have related messages.
36 - {id} attachment_init(
41 // creates or returns id (can look for existing messages?
42 // can do a merge?? - copy 'old' record data into 'new'.... "prefer checksummed"
49 {created} // message date..
54 attachment_update_store(
62 // valac --pkg gmime --vapi
65 // http://www.fromdual.com/mysql-vala-program-example << check mysql if this does not work.
67 valac -g --vapidir=. --thread strip.vala --vapidir=../vapi \
68 --pkg glib-2.0 --pkg mysql --pkg gio-2.0 --pkg posix --pkg gmime-2.6 \
69 --Xcc=-lmysqlclient -v \
73 public class StripApplication : GLib.Application {
75 public static string? opt_path = null;
76 public static string? opt_file = null;
77 public static string? opt_target_path = null;
78 public static string? opt_db_host = "127.0.0.1";
79 public static string? opt_db_name = null;
80 public static string? opt_db_user = null;
81 public static string? opt_db_pass = null;
83 public static int opt_limit = -1;
85 public static bool opt_is_extracting = false;
86 public static bool opt_is_replacing = false;
87 public static bool opt_scan_imap = false;
88 public static bool opt_scan_mailfort = false;
89 public static bool opt_dump = false;
90 public static bool opt_debug = false;
92 public static bool opt_debug_sql = false;
93 public static string? opt_replace_link = null;
96 public const GLib.OptionEntry[] options = {
98 { "debug", 0, 0, OptionArg.NONE, ref opt_debug, "show debug messages for components", null },
99 { "debug-sql", 0, 0, OptionArg.NONE, ref opt_debug_sql, "debug the SQL statements", null },
101 { "path", 0, 0, OptionArg.STRING, ref opt_path, "Directory where email to be parsed is", null },
102 { "file", 0, 0, OptionArg.STRING, ref opt_file, "A specific file to be parsed", null },
104 { "target-path", 0, 0, OptionArg.STRING, ref opt_target_path, "Directory where attachments are to be put", null },
106 { "link", 0, 0, OptionArg.STRING, ref opt_replace_link, "url for the replement link: eg. http://www.mysite.com/xxxx/%s", null },
108 { "host", 0, 0, OptionArg.STRING, ref opt_db_host, "Mysql host (default localhost)", null },
109 { "name", 0, 0, OptionArg.STRING, ref opt_db_name, "Mysql database name REQUIRED", null },
110 { "user", 0, 0, OptionArg.STRING, ref opt_db_user, "Mysql database user REQUIRED", null },
111 { "pass", 0, 0, OptionArg.STRING, ref opt_db_pass, "Mysql database password (default empty)", null },
113 { "extract", 0, 0, OptionArg.NONE, ref opt_is_extracting, "Should attachments be extracted (default NO)", null },
114 { "replace", 0, 0, OptionArg.NONE, ref opt_is_replacing, "Should attachments be replaced (default NO)", null },
115 { "dump", 0, 0, OptionArg.NONE, ref opt_dump, "Print the replaced mail contents to stdout", null },
117 { "limit", 0, 0, OptionArg.INT, ref opt_limit, "stop after X number of messages with attachments have been processed", null },
119 { "scan-imap", 0, 0, OptionArg.NONE, ref opt_scan_imap, "scan an imap tree", null },
120 { "scan-mailfort", 0, 0, OptionArg.NONE, ref opt_scan_mailfort, "scan a mailfort tree", null },
123 public StripApplication( string[] args )
126 application_id: "org.roojs.mailstripper",
127 flags: ApplicationFlags.FLAGS_NONE
131 var opt_context = new GLib.OptionContext ("Mail Stripper");
135 opt_context.set_help_enabled (true);
136 opt_context.add_main_entries (options, null);
137 opt_context.parse ( ref args);
138 //opt_detach = !optx_no_detach;
142 // options that have to be set.. bee or hive... (or stop all)
143 if ((!opt_scan_mailfort && !opt_scan_imap) || (opt_scan_mailfort && opt_scan_imap)) {
144 stdout.printf ("You must specify the type of directory tree to scan - either imap or mailfort\n%s",
145 opt_context.get_help(true, null));
146 GLib.Process.exit(Posix.EXIT_FAILURE);
149 if ((opt_db_name == null || opt_db_name.length < 1 || opt_db_user == null || opt_db_user.length < 1)) {
150 stdout.printf ("You must specify the database name / user \n%s",
151 opt_context.get_help(true, null));
152 GLib.Process.exit(Posix.EXIT_FAILURE);
154 if ((opt_path == null || opt_path.length < 1) ) {
155 stdout.printf ("You must specify the scan start path\n%s",
156 opt_context.get_help(true, null));
157 GLib.Process.exit(Posix.EXIT_FAILURE);
159 if (opt_replace_link == null || (opt_replace_link.length < 1)) {
160 stdout.printf ("You must specify the link to use in the replacement \n%s",
161 opt_context.get_help(true, null));
162 GLib.Process.exit(Posix.EXIT_FAILURE);
164 if ((opt_is_replacing || opt_is_extracting ) && (opt_target_path == null || opt_target_path.length < 1)) {
165 stdout.printf ("You must specify a target path to put attachments\n%s",
166 opt_context.get_help(true, null));
167 GLib.Process.exit(Posix.EXIT_FAILURE);
171 } catch (GLib.OptionError e) {
172 stdout.printf ("error: %s\n", e.message);
173 stdout.printf ("Run '%s --help' to see a full list of available command line options.\n%s",
174 args[0], opt_context.get_help(true, null));
175 GLib.Process.exit(Posix.EXIT_FAILURE);
179 public static int main(string[] args)
182 var application = new StripApplication( args);
184 GLib.Log.set_always_fatal(LogLevelFlags.LEVEL_ERROR | LogLevelFlags.LEVEL_CRITICAL);
186 if (opt_debug || opt_debug_sql) {
187 GLib.Log.set_handler(null,
188 GLib.LogLevelFlags.LEVEL_DEBUG | GLib.LogLevelFlags.LEVEL_WARNING | GLib.LogLevelFlags.LEVEL_INFO,
196 if (StripApplication.opt_is_replacing) {
197 StripApplication.opt_is_extracting = true;
200 GLib.debug("scanning folder: %s", opt_path );
202 var strip = new Strip( opt_path );
205 strip.mysql = new Mysql.Database();
206 if (!strip.mysql.real_connect(
209 opt_db_pass == null ? "" : opt_db_pass, //passwd
211 3306, // not changable...?
215 stdout.printf("ERROR %u: Connection failed: %s\n",
216 strip.mysql.errno(), strip.mysql.error()
220 if (opt_file != null) {
221 strip.base_dir = opt_path;
222 strip.scan_file( GLib.Path.get_dirname(opt_file), GLib.Path.get_basename(opt_file));
226 strip.scan_dir(opt_path);
234 public class Strip : GLib.Object {
238 public string base_dir = "";
240 public Mysql.Database mysql;
244 uint64 used_space_before = 0;
245 uint64 used_space_after = 0;
248 public Strip(string base_dir)
250 this.base_dir = base_dir;
253 public void handle_part(GMime.Object parent, GMime.Object mime_obj)
255 if (mime_obj is GMime.Part) {
256 var p = (GMime.Part)mime_obj;
257 var ct = p.get_content_type();
258 var cd = p.get_content_disposition();
260 var sid = p.get_header("X-strip-id");
261 if (sid != null && sid.length > 0) {
262 this.update_attachment_db(p);
263 GLib.debug("Skip attachment replace - it's already been done");
267 if (cd == null || cd.get_disposition().down() != "attachment") {
270 if (ct.get_media_type() == "text") {
273 if (ct.to_string() == "application/pgp-encrypted") {
276 if (ct.to_string() == "application/pgp-keys") {
279 if (p.get_filename() == null) {
282 // print("got part %s\n", ct.to_string());
283 if (parent is GMime.Multipart) {
285 this.replace_attachment(((GMime.Multipart)parent), p);
293 if (mime_obj is GMime.Multipart) {
296 var mp = (GMime.Multipart)mime_obj;
297 //var ct = mp.get_content_type();
299 //print("got multi-part %s\n", ct.to_string());
300 for (var i = 0; i< mp.get_count(); i++) {
301 var mo = mp.get_part(i);
302 this.handle_part(mime_obj,mo);
304 // ((GMime.Multipart)mime_obj).foreach((sub_obj) => {
305 // Strip.handle_part(sub_obj);
313 if (mime_obj is GMime.MessagePart) {
314 var msg = ((GMime.MessagePart)mime_obj).get_message();
315 msg.foreach((subobj) => {
316 this.handle_part(msg,subobj);
319 //print("got message-part\n");
323 if (mime_obj is GMime.Message) {
324 var mp = ((GMime.Message) mime_obj).get_mime_part();
326 if (!(mp is GMime.Multipart)) {
327 GLib.debug("get mimepart does not return a Multipart?");
331 var mpc = ((GMime.Multipart)mp).get_count();
333 //GLib.debug("Message has %d parts", mpc);
334 for (var i =0 ; i < mpc; i++) {
335 //GLib.debug("Getting part %d", i);
336 var submime_obj = ((GMime.Multipart)mp).get_part(i);
337 this.handle_part(mp,submime_obj);
339 print("got message??\n");
343 print("got something else\n");
347 public void update_attachment_db(GMime.Part attachment)
349 // only called when we have an sid...
350 var sid = attachment.get_header("X-strip-id");
351 if (sid == null || sid.length < 1) {
352 GLib.debug("Strange - update attachment db called ?");
356 SELECT attachment_update(
357 %d, -- in_id INT(11),
358 '%s', -- in_mime_type varchar(255),
359 %d, -- in_mime_size int(11),
360 '%s', -- in_created DATETIME,
361 '%s' -- in_mailfort_sig varchar(64)
368 "", // this will be ignored..
369 0, // this will be ingored..
371 this.mysql_escape(this.active_message_x_mailfort_sig)
381 public void replace_attachment(GMime.Multipart parent, GMime.Part attachment)
383 var sid = attachment.get_header("X-strip-id");
384 if (sid != null && sid.length > 0) {
385 GLib.debug("Skip attachment replace - it's already been done");
389 var c = attachment.get_content_object();
391 var filename = attachment.get_filename().replace("/", "-");
392 var fn = GLib.Environment.get_tmp_dir() +
393 "/"+ this.active_name + "."+ filename;
395 var outfile = new GMime.StreamFile.for_path(fn, "w");
396 outfile.set_owner(true);
397 var file_size = (int) c.write_to_stream(outfile);
398 var chksum = this.md5_file(fn);
402 if (file_size == 0) {
404 GLib.debug("ERROR - file size of write to stream returned 0?");
412 var mime_type= attachment.get_content_type().to_string();
413 // at this point we have to do our database magic...
414 //filesize / name / date / checksum / mimetype -- into mailfort should be OK.
420 '%s', -- in_msgid VARCHAR(32),
421 '%s', -- in_checksum VARCHAR(64),
422 '%s', -- in_mime_filename varchar(255)
427 this.mysql_escape(this.active_message_exim_id),
429 this.mysql_escape( attachment.get_filename() ), // what is thsi is invalid?
433 var rs = mysql.use_result();
435 var row = rs.fetch_row();
439 //var create_dir = this.created_dir;
442 if (rs != null && rs.num_rows() > 0) {
444 //create_dir = row[1];
445 //GLib.debug("msgid %s", row[2]);
447 this.used_space_after += file_size;
448 if (StripApplication.opt_is_extracting) {
449 target_fn = StripApplication.opt_target_path + "/" + this.created_dir +"/"+ file_id + "-" + filename;
455 GLib.debug("fn = %s, m5=%s, id= %s", filename, mime_type, this.active_message_id);
458 SELECT attachment_update(
459 %d, -- in_id INT(11),
460 '%s', -- in_mime_type varchar(255),
461 %d, -- in_mime_size int(11),
462 '%s', --in_created DATETIME,
463 '%s', -- in_mailfort_sig varchar(64)
497 COALESCE((SELECT id from MailQueue where msgid = '%s' AND message_sig = '%s' AND msgid != '' AND message_sig != '' LIMIT 1),0),
507 COALESCE((SELECT id from email_log where msgid = '%s' AND message_sig = '%s' AND msgid != '' AND message_sig != '' LIMIT 1),0),
521 this.mysql_escape(this.active_message_exim_id), this.mysql_escape(this.active_message_x_mailfort_sig),
526 this.mysql_escape(this.active_message_exim_id), // msgid ?
527 this.mysql_escape(this.active_message_exim_id), this.mysql_escape(this.active_message_x_mailfort_sig), // maillog_id
528 this.created_date, // delivered (from lookup)
532 "", // filled in after we find out the id..
535 this.mysql_escape(filename),
536 this.mysql_escape(mime_type)
543 this.used_space_after += file_size;
545 var file_id_real = this.mysql.insert_id();
546 GLib.debug("Got file_id = %d\n", (int)file_id_real);
547 if (file_id_real == 0) {
548 GLib.Process.exit(1);
550 file_id = "%d".printf((int)file_id_real);
554 if (StripApplication.opt_is_extracting) {
555 target_fn = StripApplication.opt_target_path + "/" + this.created_dir +"/"+ file_id + "-" + filename;
558 var stored = "/" + this.created_dir +"/"+ file_id + "-" + filename;
564 stored_filename = '%s'
568 this.mysql_escape( stored),
574 var rep = new GMime.Part.with_type("text","html");
575 // we have to set up a redirect server - to redirect hpasite... to their internal service..
576 rep.set_filename(filename);
577 var txt = "<html><body>"+
578 "<a href=\"" + StripApplication.opt_replace_link + "/" +
579 file_id + "/" + create_dir + "/"+chksum+"/"+ GLib.Uri.escape_string( filename) +"\">" +
580 GLib.Uri.escape_string( filename) + // fixme needs html escaping...
584 rep.get_content_type().set_parameter("charset", "utf-8");
585 rep.set_header("X-strip-id", file_id);
586 rep.set_header("X-strip-content-name", filename);
587 rep.set_header("X-strip-path", create_dir + "/" + file_id + "-" + filename);
588 rep.set_header("X-strip-content-type", mime_type);
589 var stream = new GMime.StreamMem.with_buffer(txt.data);
590 var con = new GMime.DataWrapper.with_stream(stream,GMime.ContentEncoding.DEFAULT);
592 rep.set_content_object(con);
593 GLib.debug("Replacing Attachment with HTML");
594 parent.replace(parent.index_of(attachment), rep);
595 this.has_replaced = true;
597 if (StripApplication.opt_is_extracting && target_fn.length > 0) {
598 var dir = GLib.Path.get_dirname(target_fn);
599 if (!FileUtils.test (dir, FileTest.IS_DIR)) {
600 GLib.DirUtils.create_with_parents(dir, 0755);
602 GLib.debug("Creating file %s", target_fn);
603 if (!FileUtils.test (target_fn, FileTest.EXISTS)) {
604 Posix.link(fn, target_fn);
607 GLib.debug("Skipping extraction %s", target_fn);
615 public int query(string str)
617 if (StripApplication.opt_debug_sql) {
618 GLib.debug("SQL: %s\n", str);
621 var rc= this.mysql.query(str);
624 GLib.debug("ERROR %u: Query failed: %s\n", this.mysql.errno(), this.mysql.error());
629 public string mysql_escape(string str)
631 unichar[] value_escaped = new unichar[str.length * 2 + 1];
632 this.mysql.real_escape_string ((string) value_escaped, str, str.length);
633 return (string) value_escaped;
636 public string md5_file(string fn) {
637 Checksum checksum = new Checksum (ChecksumType.MD5);
639 FileStream stream = FileStream.open (fn, "rb");
643 while ((size = stream.read (fbuf)) > 0) {
644 checksum.update (fbuf, size);
647 unowned string digest = checksum.get_string ();
651 string active_path = "";
652 string active_name = "";
653 string active_message_id = "";
654 string active_message_x_mailfort_sig = "";
655 string active_message_exim_id = "";
656 bool has_replaced = false;
657 string created_date = ""; // should be YYYY-mm-dd
658 string created_dir = ""; // should be YYY/mm/dd
660 public void scan_file(string path, string name)
662 GLib.debug("Scan: %s/%s", path,name);
664 this.has_replaced = false;
665 this.active_path = path;
666 this.active_name = name;
667 this.active_message_id = "";
669 this.created_dir = this.active_path.substring(this.base_dir.length + 1 );
670 this.created_date = this.created_dir.replace("/", "-");
672 var file_size = (int) File.new_for_path(path +"/" + name)
673 .query_info(GLib.FileAttribute.STANDARD_SIZE,GLib.FileQueryInfoFlags.NONE,null)
676 this.used_space_before += file_size;
678 var stream = new GMime.StreamFs.for_path (path +"/" + name,Posix.O_RDONLY, 0);
679 //stream.set_owner(true);
680 var parser = new GMime.Parser.with_stream(stream);
681 var message = parser.construct_message();
683 if (message == null) {
684 GLib.debug("Could not parse file? %s/%s", path,name);
685 this.used_space_after += file_size;
690 // check : - is message over a year old?
691 // get various msg info..
692 this.active_message_id = message.get_message_id();
693 this.active_message_x_mailfort_sig = message.get_header("x-mailfort-sig");
694 var recvd = message.get_header("received");
695 this.active_message_exim_id = "";
696 if (recvd != null && recvd.length > 1) {
697 GLib.debug("RECV: %s", recvd);
698 var lines = recvd.split("\t");
699 for (var i = 0; i < lines.length;i++) {
700 var bits = lines[i].strip().split(" ");
701 if (bits[0] == "id") {
702 this.active_message_exim_id = bits[1];
706 GLib.debug("Message DATA:\n mid: %s\nmailfort: %s \nexim_id: %s",
707 this.active_message_id,
708 this.active_message_x_mailfort_sig,
709 this.active_message_exim_id
715 var mp = message.get_mime_part();
717 if (!(mp is GMime.Multipart)) {
718 GLib.debug("get mimepart does not return a Multipart?");
719 this.used_space_after += file_size;
723 var mpc = ((GMime.Multipart)mp).get_count();
725 //GLib.debug("Message has %d parts", mpc);
726 for (var i =0 ; i < mpc; i++) {
727 //GLib.debug("Getting part %d", i);
728 var mime_obj = ((GMime.Multipart)mp).get_part(i);
729 this.handle_part(mp,mime_obj);
734 // stream.set_owner(false);
736 stream = null;//.close();
739 if (!this.has_replaced) {
740 this.used_space_after += file_size;
741 GLib.debug("skpping write file - no replacement occured");
745 GMime.Stream outstream = new GMime.StreamNull();
746 if (StripApplication.opt_is_replacing) {
748 tmpfile = GLib.Environment.get_tmp_dir() +"/" + name;
749 outstream = new GMime.StreamFile.for_path (tmpfile,"w");
750 ((GMime.StreamFile)outstream).set_owner(true);
752 if (StripApplication.opt_dump) {
753 outstream = new GMime.StreamMem();
756 file_size = (int) message.write_to_stream(outstream);
757 if (StripApplication.opt_is_replacing) {
758 ((GMime.StreamFile)outstream).set_owner(false);
760 if (StripApplication.opt_dump) {
761 var ua = ((GMime.StreamMem)outstream).get_byte_array().data;
762 print("%s\n", (string) ua);
767 GLib.debug("finished writing output %d", file_size);
772 this.used_space_after += file_size;
775 if (StripApplication.opt_is_replacing) {
776 Posix.unlink(path +"/" + name);
777 GLib.debug("copy tmp file %s to %s" , tmpfile, path +"/" + name);
778 Posix.link(tmpfile, path +"/" + name);
779 Posix.unlink(tmpfile);
783 if (StripApplication.opt_limit > -1 && this.processed >= StripApplication.opt_limit) {
784 GLib.debug("Reached replacement limit");
794 public void scan_dir(string path)
796 var f = File.new_for_path(path);
797 FileEnumerator file_enum;
798 var cancellable = new Cancellable ();
800 file_enum = f.enumerate_children(
801 FileAttribute.STANDARD_DISPLAY_NAME + "," + FileAttribute.STANDARD_TYPE,
802 FileQueryInfoFlags.NOFOLLOW_SYMLINKS, // FileQueryInfoFlags.NONE,
806 GLib.debug("Got error scanning dir? %s", e.message);
807 // FIXME - show error..
812 while (cancellable.is_cancelled () == false ) {
814 next_file = file_enum.next_file (cancellable);
816 GLib.debug("error getting next file? %s", e.message);
820 if (next_file == null) {
825 if (next_file.get_file_type() != FileType.DIRECTORY) {
827 if (next_file.get_display_name()[0] == ',') {
831 this.scan_file(path , next_file.get_display_name());
832 this.report_state("After scanning %s/%s".printf(path , next_file.get_display_name()));
837 //stdout.printf("Monitor.monitor: got file %s : type :%u\n",
838 // next_file.get_display_name(), next_file.get_file_type());
841 if (next_file.get_is_symlink()) {
846 var ds = next_file.get_display_name();
851 if (ds == "attachments") {
856 var sp = path+"/"+next_file.get_display_name();
858 //print("got a file : " + sp);
869 void report_state(string msg)
871 // Saved: 2G Original 10G : 20%
872 GLib.debug("Saved : %s (%.1f%%) | Original %s | %s",
873 GLib.format_size(this.used_space_before - this.used_space_after),
874 100f * ((1f * (this.used_space_before - this.used_space_after)) / (this.used_space_before * 1f)),
875 GLib.format_size(this.used_space_before),