2 * @class Roo.htmleditor.FilterWord
3 * Tries to clean up the markup mess that Microsoft Word generates.
5 * This is the 'nice version' - see the 'Heavy' variant, which whitelists only a very short list of elements and runs multiple filters.
8 * Run a new Word Filter
9 * @param {Object} config Configuration options
Roo.htmleditor.FilterWord = function(cfg)
{
    // The config object is not copied onto the instance; only cfg.node is read.
    var root = cfg.node;

    // Rewrite Word's bullet paragraphs found under the supplied node.
    this.replaceDocBullets(root);

    // The generic walk() pass is deliberately not run from here, because the
    // element removal is already performed by other filters.
};
23 Roo.extend(Roo.htmleditor.FilterWord, Roo.htmleditor.Filter,
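29 * Clean up MS Word-specific markup on a single element; returns false when the element itself was removed (so its children need no visit) and true when the children should still be processed.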
// Cleans one element of Word-specific markup.
// @param {Element} node  the element to inspect
// @return {Boolean} false when the node was removed from its parent (nothing left to
//                   descend into), true when the caller should go on to the children.
31 replaceTag : function(node)
34 // unwrap: a bare span holding only a single text node is replaced by just that text.
36 node.nodeName == 'SPAN' &&
37 !node.hasAttributes() &&
38 node.childNodes.length == 1 &&
39 node.firstChild.nodeName == "#text"
// Detach the text, re-insert it in front of the span with a single space on each
// side, then drop the now-empty span.
41 var textNode = node.firstChild;
42 node.removeChild(textNode);
// NOTE(review): this branch is only reached when node.hasAttributes() is false, so
// getAttribute('lang') looks like it is always null here and the zh-CN test can never
// suppress the padding - confirm whether the language was meant to come from an ancestor.
43 if (node.getAttribute('lang') != 'zh-CN') { // do not space pad on chinese characters..
44 node.parentNode.insertBefore(node.ownerDocument.createTextNode(" "), node);
46 node.parentNode.insertBefore(textNode, node);
47 if (node.getAttribute('lang') != 'zh-CN') { // do not space pad on chinese characters..
48 node.parentNode.insertBefore(node.ownerDocument.createTextNode(" ") , node);
51 node.parentNode.removeChild(node);
52 return false; // don't do children - we have removed our node, so there are none left to visit.
// Elements that are dropped together with everything inside them (exact tag-name match).
57 if (node.tagName.toLowerCase().match(/^(style|script|applet|embed|noframes|noscript)$/)) {
58 node.parentNode.removeChild(node);
59 return false; // don't do children
61 //Roo.log(node.tagName);
62 // remove - but keep children..
// The pattern is anchored at the start only, so it matches tag-name prefixes
// (e.g. any name beginning with "o:"); "\\?" matches an optional literal backslash.
63 if (node.tagName.toLowerCase().match(/^(meta|link|\\?xml:|st1:|o:|v:|font)/)) {
64 //Roo.log('-- removed');
// Re-parent every child in front of the node, one at a time, until the node is empty.
65 while (node.childNodes.length) {
66 var cn = node.childNodes[0];
68 node.parentNode.insertBefore(cn, node);
69 // move node to parent - and clean it..
70 if (cn.nodeType == 1) {
75 node.parentNode.removeChild(node);
76 /// no need to iterate children - it has none left.
77 //this.iterateChildren(node, this.cleanWord);
78 return false; // no need to iterate children.
// Class clean-up: the class attribute is split on non-word characters and each token
// is tested against the Mso* pattern; the attribute is rewritten from `cna`
// (presumably the tokens that did not match - verify against the full source).
81 if (node.className.length) {
// `cn` is re-declared here with var; it is the same function-scoped variable as above.
83 var cn = node.className.split(/\W+/);
85 Roo.each(cn, function(cls) {
86 if (cls.match(/Mso[a-zA-Z]+/)) {
91 node.className = cna.length ? cna.join(' ') : '';
93 node.removeAttribute("class");
// The lang attribute is always removed when present.
97 if (node.hasAttribute("lang")) {
98 node.removeAttribute("lang");
// Inline style clean-up: the attribute is split on ';' and each declaration's property
// name is tested against a list of prefixes; the attribute is rewritten from `nstyle`.
101 if (node.hasAttribute("style")) {
103 var styles = node.getAttribute("style").split(";");
105 Roo.each(styles, function(s) {
109 var kv = s.split(":");
// NOTE(review): kv[0] is not trimmed before this start-anchored match, so a property
// name preceded by whitespace (e.g. after "; ") would not match - confirm.
110 if (kv[0].match(/^(mso-|line|font|background|margin|padding|color)/)) {
113 // what ever is left... we allow.
116 node.setAttribute("style", nstyle.length ? nstyle.join(';') : '');
// An empty style attribute is removed rather than left as style="".
117 if (!nstyle.length) {
118 node.removeAttribute('style');
121 return true; // do children
127 styleToObject: function(node)
129 var styles = (node.getAttribute("style") || '').split(";");
131 Roo.each(styles, function(s) {
135 var kv = s.split(":");
137 // what ever is left... we allow.
138 ret[kv[0].trim()] = kv[1];
143 replaceDocBullets : function(doc)
145 // this is a bit odd - but it appears some indents use ql-indent-1
146 //Roo.log(doc.innerHTML);
148 var listpara = doc.getElementsByClassName('MsoListParagraphCxSpFirst');
149 for( var i = 0; i < listpara.length; i ++) {
150 listpara.item(i).className = "MsoListParagraph";
152 // this is a bit hacky - we had one word document where h2 had a miso-list attribute.
153 var htwo = doc.getElementsByTagName('h2');
154 for( var i = 0; i < htwo.length; i ++) {
155 if (htwo.item(i).hasAttribute('style') && htwo.item(i).getAttribute('style').match(/mso-list:/)) {
156 htwo.item(i).className = "MsoListParagraph";
159 listpara = doc.getElementsByClassName('MsoNormal');
160 while(listpara.length) {
161 if (listpara.item(0).hasAttribute('style') && listpara.item(0).getAttribute('style').match(/mso-list:/)) {
162 listpara.item(0).className = "MsoListParagraph";
164 listpara.item(0).className = "MsoNormalx";
167 listpara = doc.getElementsByClassName('ql-indent-1');
168 while(listpara.length) {
169 this.replaceDocBullet(listpara.item(0));
171 listpara = doc.getElementsByClassName('MsoListParagraph');
172 while(listpara.length) {
174 this.replaceDocBullet(listpara.item(0));
181 replaceDocBullet : function(p)
183 // gather all the siblings.
185 parent = p.parentNode,
186 doc = parent.ownerDocument,
191 if (ns.nodeType != 1) {
195 if (!ns.className.match(/(MsoListParagraph|ql-indent-1)/i)) {
198 if (ns.getAttribute('style').match(/mso-list/)) {
204 var spans = ns.getElementsByTagName('span');
208 var has_list = false;
209 for(var i = 0; i < spans.length; i++) {
210 if (spans[i].getAttribute('style').match(/mso-list/)) {
228 var ul = parent.ownerDocument.createElement('ul'); // what about number lists...
229 parent.insertBefore(ul, p);
234 var margin_to_depth = {};
237 items.forEach(function(n, ipos) {
238 //Roo.log("got innertHMLT=" + n.innerHTML);
240 var spans = n.getElementsByTagName('span');
242 //Roo.log("No spans found");
244 parent.removeChild(n);
247 return; // skip it...
253 for(var i = 0; i < spans.length; i++) {
255 style = this.styleToObject(spans[i]);
256 if (typeof(style['mso-list']) == 'undefined') {
260 spans[i].parentNode.removeChild(spans[i]); // remove the fake bullet.
263 //Roo.log("NOW GOT innertHMLT=" + n.innerHTML);
264 style = this.styleToObject(n); // mo-list is from the parent node.
265 if (typeof(style['mso-list']) == 'undefined') {
266 //Roo.log("parent is missing level");
268 parent.removeChild(n);
273 var margin = style['margin-left'];
274 if (typeof(margin_to_depth[margin]) == 'undefined') {
276 margin_to_depth[margin] = max_margins;
278 nlvl = margin_to_depth[margin] ;
282 var nul = doc.createElement('ul'); // what about number lists...
284 last_li = doc.createElement('li');
285 stack[lvl].appendChild(last_li);
287 last_li.appendChild(nul);
293 var nli = stack[nlvl].appendChild(doc.createElement('li'));
295 nli.innerHTML = n.innerHTML;
296 //Roo.log("innerHTML = " + n.innerHTML);
297 parent.removeChild(n);