2 * @class Roo.htmleditor.FilterWord
3 * Tries to clean up all the mess that Word generates.
5 * This is the 'nice' version - see 'Heavy', which whitelists a very short list of elements, and multi-filters
8 * Run a new Span Filter
9 * @param {Object} config Configuration options
// Constructor: runs the Word clean-up passes against cfg.node as soon as the filter is created.
// @param {Object} cfg  only cfg.node is read here; it is handed to each pass below.
12 Roo.htmleditor.FilterWord = function(cfg)
14 // no need to apply config.
// Pass 1: normalise Word list paragraphs and turn them into real lists.
15 this.replaceDocBullets(cfg.node);
// Pass 2: strip name attributes from <a> elements.
17 this.replaceAname(cfg.node);
18 // this is disabled as the removal is done by other filters;
19 // this.walk(cfg.node);
// Pass 3: replace Word's image-wrapping tables with the bare <img>.
20 this.replaceImageTable(cfg.node);
24 Roo.extend(Roo.htmleditor.FilterWord, Roo.htmleditor.Filter,
30 * Clean up MS wordisms...
// Cleans a single element of Word-specific markup.
// @param {Node} node  element being visited; tagName, className and attributes are read from it.
// @return {Boolean} false when the node was removed or unwrapped here (nothing left to descend into),
//                   true when the children should still be processed.
32 replaceTag : function(node)
35 // no idea what this does - span with text, replaceds with just text.
// The four tests below select a bare <span> (no attributes) whose only child is a text node.
37 node.nodeName == 'SPAN' &&
38 !node.hasAttributes() &&
39 node.childNodes.length == 1 &&
40 node.firstChild.nodeName == "#text"
// Detach the text, re-insert it in front of the span (padded with spaces), then drop the span.
42 var textNode = node.firstChild;
43 node.removeChild(textNode);
// NOTE(review): if this branch is guarded by the !node.hasAttributes() test above, the span cannot
// carry a lang attribute, so this comparison looks always true and the padding is always added - confirm.
44 if (node.getAttribute('lang') != 'zh-CN') { // do not space pad on chinese characters..
45 node.parentNode.insertBefore(node.ownerDocument.createTextNode(" "), node);
47 node.parentNode.insertBefore(textNode, node);
// NOTE(review): same always-true concern as the leading pad above.
48 if (node.getAttribute('lang') != 'zh-CN') { // do not space pad on chinese characters..
49 node.parentNode.insertBefore(node.ownerDocument.createTextNode(" ") , node);
52 node.parentNode.removeChild(node);
53 return false; // dont do chidren - we have remove our node - so no need to do chdhilren?
// Elements whose tag name is exactly one of these are removed together with their content.
58 if (node.tagName.toLowerCase().match(/^(style|script|applet|embed|noframes|noscript)$/)) {
59 node.parentNode.removeChild(node);
60 return false; // dont do chidlren
62 //Roo.log(node.tagName);
63 // remove - but keep children..
// This pattern has no trailing '$', so it is a prefix match (any tag name starting with "font", "o:", ...).
// NOTE(review): inside a regex literal '\\?' means "an optional backslash"; if a literal '?' before
// "xml:" was intended it would be written '\?xml:' - confirm.
64 if (node.tagName.toLowerCase().match(/^(meta|link|\\?xml:|st1:|o:|v:|font)/)) {
65 //Roo.log('-- removed');
// Move every child out in front of the node, one at a time, until the node is empty.
66 while (node.childNodes.length) {
67 var cn = node.childNodes[0];
69 node.parentNode.insertBefore(cn, node);
70 // move node to parent - and clean it..
// Only element children (nodeType 1) get the extra clean-up; the call itself is not visible in this excerpt.
71 if (cn.nodeType == 1) {
76 node.parentNode.removeChild(node);
77 /// no need to iterate chidlren = it's got none..
78 //this.iterateChildren(node, this.cleanWord);
79 return false; // no need to iterate children.
// Class clean-up: split the class list on non-word characters and test each name against Mso*.
82 if (node.className.length) {
84 var cn = node.className.split(/\W+/);
86 Roo.each(cn, function(cls) {
// presumably Mso* names are skipped and the others collected in cna - the callback body is not visible here.
87 if (cls.match(/Mso[a-zA-Z]+/)) {
92 node.className = cna.length ? cna.join(' ') : '';
94 node.removeAttribute("class");
// The lang attribute is always dropped.
98 if (node.hasAttribute("lang")) {
99 node.removeAttribute("lang");
// Style clean-up: the inline style is split into declarations on ';'.
102 if (node.hasAttribute("style")) {
104 var styles = node.getAttribute("style").split(";");
106 Roo.each(styles, function(s) {
110 var kv = s.split(":");
// Properties whose name starts with one of these prefixes are singled out (presumably dropped - confirm).
111 if (kv[0].match(/^(mso-|line|font|background|margin|padding|color)/)) {
114 // what ever is left... we allow.
// Write the surviving declarations back; an empty result removes the attribute entirely.
117 node.setAttribute("style", nstyle.length ? nstyle.join(';') : '');
118 if (!nstyle.length) {
119 node.removeAttribute('style');
122 return true; // do children
// Parses a node's inline style attribute into a lookup keyed by trimmed property name.
// @param {Element} node  element whose "style" attribute is read (a missing attribute is treated as '').
// The declaration of ret and the return statement are not visible in this excerpt; presumably ret is returned.
128 styleToObject: function(node)
130 var styles = (node.getAttribute("style") || '').split(";");
132 Roo.each(styles, function(s) {
// NOTE(review): only kv[1] is kept, so a value that itself contains ':' (e.g. a url(http://...))
// is cut at the second colon, and the stored value is not trimmed.
136 var kv = s.split(":");
138 // what ever is left... we allow.
139 ret[kv[0].trim()] = kv[1];
// Strips the name attribute from every <a> below doc and unwraps anchors via removeNodeKeepChildren.
// @param {Element} doc  root to search; only getElementsByTagName is called on it.
145 replaceAname : function (doc)
147 // replace all the a/name without..
// Array.from takes a static snapshot, so the loop is not affected when anchors are removed from the DOM.
148 var aa = Array.from(doc.getElementsByTagName('a'));
149 for (var i = 0; i < aa.length; i++) {
// The assignment of a is not visible in this excerpt - presumably a = aa[i].
151 if (a.hasAttribute("name")) {
152 a.removeAttribute("name");
// presumably anchors that still have an href are kept as links (branch body not visible) - confirm.
154 if (a.hasAttribute("href")) {
157 // reparent children.
// removeNodeKeepChildren is not defined in this excerpt.
158 this.removeNodeKeepChildren(a);
// Normalises the class names Word uses for list paragraphs to "MsoListParagraph",
// then hands each such paragraph to replaceDocBullet.
// @param {Element} doc  root to search.
// Each lookup is wrapped in Array.from because getElementsByClassName returns a live collection
// and the className assignments below would change it during iteration.
// NOTE(review): assigning className replaces every other class the element had.
168 replaceDocBullets : function(doc)
170 // this is a bit odd - but it appears some indents use ql-indent-1
171 //Roo.log(doc.innerHTML);
173 var listpara = Array.from(doc.getElementsByClassName('MsoListParagraphCxSpFirst'));
174 for( var i = 0; i < listpara.length; i ++) {
175 listpara[i].className = "MsoListParagraph";
178 listpara = Array.from(doc.getElementsByClassName('MsoListParagraphCxSpMiddle'));
179 for( var i = 0; i < listpara.length; i ++) {
180 listpara[i].className = "MsoListParagraph";
182 listpara = Array.from(doc.getElementsByClassName('MsoListParagraphCxSpLast'));
183 for( var i = 0; i < listpara.length; i ++) {
184 listpara[i].className = "MsoListParagraph";
186 listpara = Array.from(doc.getElementsByClassName('ql-indent-1'));
187 for( var i = 0; i < listpara.length; i ++) {
188 listpara[i].className = "MsoListParagraph";
191 // this is a bit hacky - we had one word document where h2 had a miso-list attribute.
// Only <h2> elements are checked for an mso-list style here.
192 var htwo = Array.from(doc.getElementsByTagName('h2'));
193 for( var i = 0; i < htwo.length; i ++) {
194 if (htwo[i].hasAttribute('style') && htwo[i].getAttribute('style').match(/mso-list:/)) {
195 htwo[i].className = "MsoListParagraph";
// MsoNormal paragraphs carrying an mso-list style are treated as list paragraphs.
198 listpara = Array.from(doc.getElementsByClassName('MsoNormal'));
199 for( var i = 0; i < listpara.length; i ++) {
200 if (listpara[i].hasAttribute('style') && listpara[i].getAttribute('style').match(/mso-list:/)) {
201 listpara[i].className = "MsoListParagraph";
// presumably the else branch (not visible): other MsoNormal paragraphs are renamed to "MsoNormalx" - confirm.
203 listpara[i].className = "MsoNormalx";
// This lookup is NOT wrapped in Array.from, so listpara is the live collection from here on.
207 listpara = doc.getElementsByClassName('MsoListParagraph');
208 // Roo.log(doc.innerHTML);
// NOTE(review): the loop ends only if every call removes item(0) from the document or clears its class,
// so that the live collection shrinks.
212 while(listpara.length) {
214 this.replaceDocBullet(listpara.item(0));
// Converts a run of sibling Word list paragraphs, starting at p, into a real (possibly nested) list element.
// @param {Element} p  first paragraph of the run; the new list is inserted before it in p.parentNode.
// NOTE(review): several statements of this method are not visible in this excerpt; the notes below
// describe only the lines that are shown.
221 replaceDocBullet : function(p)
223 // gather all the siblings.
225 parent = p.parentNode,
226 doc = parent.ownerDocument,
229 //Roo.log("Parsing: " + p.innerText) ;
// Siblings that are not elements (nodeType 1) get separate handling - presumably skipped; confirm.
232 if (ns.nodeType != 1) {
// A sibling without a list-paragraph class is singled out here - presumably it ends the run; confirm.
236 if (!ns.className.match(/(MsoListParagraph|ql-indent-1)/i)) {
237 //Roo.log("Missing para r q1indent - got:" + ns.className);
240 var spans = ns.getElementsByTagName('span');
// Case 1: the paragraph itself carries the mso-list style.
242 if (ns.hasAttribute('style') && ns.getAttribute('style').match(/mso-list/)) {
// Look for a span with an explicit font family; that font is used below to guess the list type.
251 for (var i = 0; i < spans.length;i++) {
// NOTE(review): hasAttribute('style') is tested twice; the second test is redundant.
253 if (se.hasAttribute('style') && se.hasAttribute('style') && se.style.fontFamily != '') {
254 ff = se.style.fontFamily;
260 //Roo.log("got font family: " + ff);
// True when the font is not Symbol/Wingdings and the span text is not a substring of "·o"
// (presumably this selects a numbered list - the consequence line is not visible).
// NOTE(review): an empty trimmed text yields indexOf(...) == 0, so it counts as a bullet.
261 if (typeof(ff) != 'undefined' && !ff.match(/(Symbol|Wingdings)/) && "·o".indexOf(se.innerText.trim()) < 0) {
267 //Roo.log("no mso-list?");
// Case 2: the paragraph has no mso-list style of its own, so its spans are checked for one.
269 var spans = ns.getElementsByTagName('span');
273 var has_list = false;
274 for(var i = 0; i < spans.length; i++) {
275 if (spans[i].hasAttribute('style') && spans[i].getAttribute('style').match(/mso-list/)) {
// Create the root list element (tag name taken from listtype) and place it before the first paragraph.
293 var ul = parent.ownerDocument.createElement(listtype); // what about number lists...
294 parent.insertBefore(ul, p);
// Maps each distinct margin-left value (as a string key) to a nesting depth.
299 var margin_to_depth = {};
// The callback uses this.styleToObject, so forEach presumably receives this as thisArg (not visible) - confirm.
302 items.forEach(function(n, ipos) {
303 //Roo.log("got innertHMLT=" + n.innerHTML);
305 var spans = n.getElementsByTagName('span');
307 //Roo.log("No spans found");
// Paragraphs in this branch are removed from the document and produce no list item.
309 parent.removeChild(n);
312 return; // skip it...
// Find the span whose style has an mso-list entry: it holds Word's fake bullet / number text.
318 for(var i = 0; i < spans.length; i++) {
320 style = this.styleToObject(spans[i]);
321 if (typeof(style['mso-list']) == 'undefined') {
324 if (listtype == 'ol') {
// NOTE(review): /[^0-9]+]/ requires a literal ']' after the non-digits, so text such as "1)" is left
// unchanged and "* 1" gives NaN; /[^0-9]+/g was probably intended - confirm before changing.
325 num = spans[i].innerText.replace(/[^0-9]+]/g,'') * 1;
327 spans[i].parentNode.removeChild(spans[i]); // remove the fake bullet.
330 //Roo.log("NOW GOT innertHMLT=" + n.innerHTML);
331 style = this.styleToObject(n); // mo-list is from the parent node.
332 if (typeof(style['mso-list']) == 'undefined') {
333 //Roo.log("parent is missing level");
// Without an mso-list entry on the paragraph it is removed.
335 parent.removeChild(n);
// The raw margin-left string is the key; a margin not seen before is assigned a depth here
// (how max_margins is advanced is not visible in this excerpt).
340 var margin = style['margin-left'];
341 if (typeof(margin_to_depth[margin]) == 'undefined') {
343 margin_to_depth[margin] = max_margins;
345 nlvl = margin_to_depth[margin] ;
// A nested list element is created and appended to last_li; the conditions guarding these lines are not visible.
349 var nul = doc.createElement(listtype); // what about number lists...
351 last_li = doc.createElement('li');
352 stack[lvl].appendChild(last_li);
354 last_li.appendChild(nul);
360 // not starting at 1..
// The start attribute is written once per list element, and only for ordered lists.
361 if (!stack[nlvl].hasAttribute("start") && listtype == "ol") {
362 stack[nlvl].setAttribute("start", num);
// The paragraph's markup is copied into a new <li>, then the paragraph is removed from the document.
365 var nli = stack[nlvl].appendChild(doc.createElement('li'));
367 nli.innerHTML = n.innerHTML;
368 //Roo.log("innerHTML = " + n.innerHTML);
369 parent.removeChild(n);
// Replaces the layout table Word wraps around a pasted image with the bare <img>.
// @param {Element} doc  root to search for <img> elements.
// The markup on the next lines is the file's own sample of such a table.
381 replaceImageTable : function(doc)
384 <table cellpadding=0 cellspacing=0 align=left>
386 <td width=423 height=0></td>
390 <td><img width=601 height=401
391 src="file:///C:/Users/Alan/AppData/Local/Temp/msohtmlclip1/01/clip_image002.jpg"
392 v:shapes="Picture_x0020_2"></td>
// Static snapshot of the images, so moving them in the DOM does not disturb the iteration.
396 var imgs = Array.from(doc.getElementsByTagName('img'));
397 Roo.each(imgs, function(img) {
// Walk up img -> TD -> TR -> TBODY -> TABLE; each test flags an image that does not have exactly
// this ancestry (the branch bodies are not visible - presumably the image is skipped).
398 var td = img.parentNode;
399 if (td.nodeName != 'TD') {
402 var tr = td.parentNode;
403 if (tr.nodeName != 'TR') {
406 var tbody = tr.parentNode;
407 if (tbody.nodeName != 'TBODY') {
410 var table = tbody.parentNode;
411 if (table.nodeName != 'TABLE') {
// Shape checks matching the sample: exactly two rows, three cells and no text content.
416 if (table.getElementsByTagName('tr').length != 2) {
419 if (table.getElementsByTagName('td').length != 3) {
422 if (table.innerText.trim() != '') {
// Move the image in front of the table, then remove the table.
// The explicit removeChild is redundant (insertBefore already moves an attached node) but harmless.
425 var p = table.parentNode;
426 img.parentNode.removeChild(img);
427 p.insertBefore(img, table);
428 p.removeChild(table);