Skip to content

Instantly share code, notes, and snippets.

@TooBug
Created March 5, 2018 06:42
Show Gist options
  • Save TooBug/da708e7b50cc5a8ddc121aa393d8f307 to your computer and use it in GitHub Desktop.
Save TooBug/da708e7b50cc5a8ddc121aa393d8f307 to your computer and use it in GitHub Desktop.
clean word tags
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title>cleanWord</title>
<style>
html,body{
height:100%;
margin:0;
padding:0;
}
#source,#sourceCode,#result,#preview{
display:inline-block;
margin:5px;
border:1px solid #ccc;
width:22%;
height:80%;
vertical-align: top;
overflow: auto;
}
</style>
</head>
<body>
<div>
<label><input type="checkbox" id="deep" checked />深度清理</label>
</div>
<div id="source" contenteditable></div>
<textarea id="sourceCode"></textarea>
<textarea id="result"></textarea>
<div id="preview"></div>
<script src="http://code.jquery.com/jquery-2.1.3.min.js"></script>
<script>
/*global $*/
/*jshint strict:false*/
/**
 * Event handler that refreshes all three output panes from the editable
 * #source area: the raw markup goes to #sourceCode, the cleaned markup goes
 * to #result (as text) and to #preview (as rendered HTML).
 *
 * The work is deferred with a zero-delay timer so that it runs after the
 * current event has finished; for a paste event that means the pasted
 * content is already part of #source when it is read.
 */
function doClean(){
    setTimeout(function runCleanup(){
        var rawHtml = $('#source').html();
        var cleanedHtml;

        // Show the untouched markup first, then the cleaned version twice:
        // once as source text and once rendered.
        $('#sourceCode').val(rawHtml);
        cleanedHtml = cleanHTML(rawHtml);
        $('#result').val(cleanedHtml);
        $('#preview').html(cleanedHtml);
    },0);
}
// Re-run the cleanup whenever something is pasted into the editable #source area.
$('#source').on('paste',doClean);
// Re-run it as well when the #deep checkbox changes, since cleanHTML reads that
// checkbox to choose between simple and deep cleaning.
$('#deep').on('change',doClean);
/**
 * Removes Microsoft Word specific markup from a pasted HTML fragment.
 *
 * Steps, in order: strip comments / conditional blocks / <style> elements,
 * drop line breaks, replace Word images with a placeholder <img>, filter
 * tags and attributes according to the clean level, remove empty elements,
 * unwrap attribute-less <span>/<a>, merge directly nested <font> opening
 * tags, and drop stray text between table row/cell tags.
 *
 * NOTE(review): this is a clean-up routine, not a security sanitizer.
 * <script> elements are not removed and every attribute of <img> is kept
 * verbatim, so do not rely on it for untrusted input.
 *
 * @param {string} sHtml HTML fragment to clean. null/undefined yields ''.
 * @param {number} [cleanLevel] 1 = simple cleaning (keeps a whitelist of
 *     attributes and style properties), 2 = deep cleaning (drops <span>
 *     and almost all attributes). When omitted or any other value, the
 *     level is read from the #deep checkbox (checked = 2, unchecked = 1).
 * @returns {string} The cleaned HTML.
 */
function cleanHTML(sHtml, cleanLevel)
{
    // IE-specific VML handling is kept but disabled, as in the original.
    var isIE = false;
    var imgPlaceholder = 'http://lorempixel.com/200/200/animals/1/placeholder/';
    var cleanPaste;
    var i;

    if(sHtml === undefined || sHtml === null)return '';
    sHtml = String(sHtml);

    if(cleanLevel === 1 || cleanLevel === 2){
        cleanPaste = cleanLevel;
    }
    else{
        cleanPaste = $('#deep').prop('checked')?2:1;
    }

    // Strips one pair of surrounding quotes from a raw attribute value.
    // Value-less (boolean) attributes such as "nowrap" capture no value at
    // all, so a missing value is treated as the empty string.
    function unquote(raw){
        if(raw === undefined)return '';
        return raw.match(/^(["']?)(.*)\1/)[2];
    }

    // Rebuilds a style attribute keeping only the properties whose name
    // matches allowedProps; returns '' when nothing is left.
    function filterStyleAttr(name,value,allowedProps){
        var kept = value.replace(/"|&quot;/ig,"'").replace(/\s*([^:]+)\s*:\s*(.*?)(;|$)/ig,function(all,prop,val){
            return allowedProps.test(prop)?(prop+':'+val+';'):'';
        }).replace(/^\s+|\s+$/g,'');
        return kept?(' '+name+'="'+kept+'"'):'';
    }

    // Simple cleaning: per-tag whitelist of attributes / style properties.
    function cleanAttrSimple(tag,n,v,all){
        switch(tag){
        case 'p':
            if(n === 'style')return filterStyleAttr(n,v,/^(text-align)$/i);
            break;
        case 'span':
            if(n === 'style')return filterStyleAttr(n,v,/^(color|background|font-size|font-family)$/i);
            break;
        case 'table':
            if(n.match(/^(cellspacing|cellpadding|border|width)$/i))return all;
            break;
        case 'td':
            if(n.match(/^(rowspan|colspan)$/i))return all;
            if(n === 'style')return filterStyleAttr(n,v,/^(width|height)$/i);
            break;
        case 'a':
            if(n.match(/^(href)$/i))return all;
            break;
        case 'font':
        case 'img':
            return all;
        }
        return '';
    }

    // Deep cleaning: only cell spans and image attributes survive.
    function cleanAttrDeep(tag,n,all){
        if(tag === 'td' && n.match(/^(rowspan|colspan)$/i))return all;
        if(tag === 'img')return all;
        return '';
    }

    // Unwraps an attribute-less <span>/<a>, keeping its content.
    function cleanEmptyTag(all,tag,content){
        return content;
    }

    // Remove block-level noise: comments, conditional sections, <style>.
    sHtml = sHtml.replace(/<!--[\s\S]*?-->|<!(--)?\[[\s\S]+?\](--)?>|<style(\s+[^>]*?)?>[\s\S]*?<\/style>/ig, '');
    sHtml = sHtml.replace(/\r?\n/ig, '');

    // Keep a placeholder where Word had an image.
    if(isIE){
        sHtml = sHtml.replace(/<v:shapetype(\s+[^>]*)?>[\s\S]*<\/v:shapetype>/ig,'');
        sHtml = sHtml.replace(/<v:shape(\s+[^>]+)?>[\s\S]*?<v:imagedata(\s+[^>]+)?>\s*<\/v:imagedata>[\s\S]*?<\/v:shape>/ig,function(all,attr1,attr2){
            var sImg,styleMatch;
            // Both attribute groups are optional and may be undefined.
            if(!/\s+src\s*=\s*("[^"]+"|'[^']+'|[^>\s]+)/i.test(attr2 || ''))return '';
            sImg = '<img src="'+imgPlaceholder+'" _xhe_temp="true" class="wordImage"';
            styleMatch = (attr1 || '').match(/\s+style\s*=\s*("[^"]+"|'[^']+'|[^>\s]+)/i);
            if(styleMatch){
                sImg += ' style="' + unquote(styleMatch[1]) + '"';
            }
            sImg += ' />';
            return sImg;
        });
    }
    else{
        sHtml = sHtml.replace(/<img( [^<>]*(v:shapes|msohtmlclip)[^<>]*)\/?>/ig,function(all,attr){
            var match,str = '<img src="'+imgPlaceholder+'" _xhe_temp="true" class="wordImage"';
            match = attr.match(/ width\s*=\s*"([^"]+)"/i);
            if(match)str += ' width="'+match[1]+'"';
            match = attr.match(/ height\s*=\s*"([^"]+)"/i);
            if(match)str += ' height="'+match[1]+'"';
            return str + ' />';
        });
    }

    // Filter every tag: drop namespaced tags, local-file <link>s and (in
    // deep mode) <span>; then filter the attributes of opening tags.
    sHtml=sHtml.replace(/(<(\/?)([\w\-:]+))((?:\s+[\w\-:]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^>\s]+))?)*)\s*(\/?>)/g,function(all,left,end,tag,attr,right){
        tag=tag.toLowerCase();
        if((tag.match(/^(link)$/)&&attr.match(/file:\/\//i))||tag.match(/:/)||(tag==='span'&&cleanPaste===2))return '';
        if(!end){
            attr=attr.replace(/\s([\w\-:]+)(?:\s*=\s*("[^"]*"|'[^']*'|[^>\s]+))?/ig,function(all,n,v){
                n=n.toLowerCase();
                // Namespaced attributes (e.g. v:shapes) are always dropped.
                if(/:/.test(n))return '';
                if(cleanPaste===1)return cleanAttrSimple(tag,n,unquote(v),all);
                return cleanAttrDeep(tag,n,all);
            });
        }
        return left+attr+right;
    });

    // Remove elements with no content; repeated so that elements which
    // become empty after an inner removal are caught too (up to 3 levels).
    for(i=0;i<3;i++){
        sHtml = sHtml.replace( /<([^\s>]+)(\s+[^>]*)?>\s*<\/\1>/g,'');
    }

    // Unwrap meaningless attribute-less <span>/<a>, handling up to three
    // levels of same-tag nesting (outermost pattern first).
    for(i=0;i<3;i++){
        sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S])*?<\/\1>)*?<\/\1>)*?)<\/\1>/ig,cleanEmptyTag);// 3 levels
    }
    for(i=0;i<3;i++){
        sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S]|<\1(\s+[^>]*?)?>((?!<\1(\s+[^>]*?)?>)[\s\S])*?<\/\1>)*?)<\/\1>/ig,cleanEmptyTag);// 2 levels
    }
    for(i=0;i<3;i++){
        sHtml = sHtml.replace(/<(span|a)>(((?!<\1(\s+[^>]*?)?>)[\s\S])*?)<\/\1>/ig,cleanEmptyTag);// innermost
    }

    // Merge directly nested <font> opening tags into one.
    // NOTE(review): only the opening tags are merged here; the matching
    // closing tags are left as they are.
    for(i=0;i<3;i++){
        sHtml = sHtml.replace(/<font(\s+[^>]+)><font(\s+[^>]+)>/ig,function(all,attr1,attr2){
            return '<font'+attr1+attr2+'>';
        });
    }

    // Drop whitespace and other stray characters between table tags; text
    // directly after an opening <td> is real cell content and is kept.
    sHtml=sHtml.replace(/(<(\/?)(tr|td)(?:\s+[^>]+)?>)[^<>]+/ig,function(all,left,end,tag){
        if(!end&&/^td$/i.test(tag))return all;
        return left;
    });

    return sHtml;
}
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment