做采集经常遇到的问题是内容排版问题,用了一些时间写了个用正则替换html标签和样式的函数,共享下。
/**
 * Clean up scraped HTML: strip presentational tags and attributes and
 * normalise the paragraph structure.
 *
 * The markup is first repaired with tidy so the regular expressions operate
 * on well-formed input, then rewritten rule by rule, then repaired a second
 * time so that tags left empty or unbalanced by the rewriting are cleaned up.
 *
 * Requires the tidy extension.
 *
 * @param string $content HTML fragment to clean; should be UTF-8 encoded
 * @return string the cleaned HTML fragment (body content only, trimmed)
 */
function removeFormat($content) {
    $content = (string) $content;
    if (trim($content) === '') {
        // Nothing to format; avoid handing an empty document to tidy.
        return '';
    }

    // Entity used to indent the start of every paragraph. An entity is used
    // instead of a literal (full-width) space to avoid encoding problems when
    // the result is written to a database.
    // NOTE(review): the entity was decoded to a plain space in the copy of
    // this code that was received, so the original number of &nbsp; entities
    // could not be recovered; one is used here -- confirm the desired indent.
    $indent = '&nbsp;';

    // A double- or single-quoted attribute value.
    $quoted = '(?:"[^"]*"|\'[^\']*\')';

    // pattern => replacement, applied in order.
    $replaces = array(
        // Presentational wrappers are dropped. "\b[^>]*" also matches opening
        // tags that carry attributes, so opening and closing tags are always
        // removed together.
        '/<font\b[^>]*>/i'   => '',
        '/<\/font>/i'        => '',
        '/<strong\b[^>]*>/i' => '',
        '/<\/strong>/i'      => '',
        '/<span\b[^>]*>/i'   => '',
        '/<\/span>/i'        => '',
        // Generic containers become paragraphs.
        '/<div\b[^>]*>/i'    => '<p>',
        '/<\/div>/i'         => '</p>',
        // CMS marker comment.
        '/<!---ecms -ecms *-->/i' => '',
        // Table flattening; keep disabled for content that contains tables.
        // "/<table>/i"   => '',
        // "/<\/table>/i" => '',
        // "/<tbody>/i"   => '',
        // "/<\/tbody>/i" => '',
        // "/<tr>/i"      => '<p>',
        // "/<\/tr>/i"    => '</p>',
        // "/<td>/i"      => '',
        // Presentational attributes. The leading whitespace keeps the rule
        // from matching inside other names or values (e.g. "?id=3" in an
        // href); the lookahead restricts it to text inside a tag.
        // width/height are deliberately left alone (hard to handle safely).
        '/\s+(?:style|class|id|lang|border|face)\s*=\s*' . $quoted . '(?=[^<>]*>)/i' => '',
        // Line breaks start a new paragraph.
        '/<br\s*\/?>[ ]*/i'  => '</p><p>',
        // Embedded frames are removed one at a time (non-greedy) and may span
        // several lines (s flag).
        '/<iframe\b[^>]*>.*?<\/iframe>/is' => '',
        // Non-breaking space entities become plain spaces.
        '/&nbsp;/i'          => ' ',
        // Collapse leading half-width/full-width spaces and line breaks at
        // the start of a paragraph into the standard indent.
        '/<p>[ \x{3000}\r\n]*/ui' => '<p>' . $indent,
    );

    $config = array(
        // 'indent' => TRUE,        // indent the output
        'output-html'    => TRUE,   // emit HTML
        'show-body-only' => TRUE,   // return only the contents of <body>
        'wrap'           => 0,      // never wrap lines
    );

    // Repair the markup first; replacing on broken HTML gives erratic results.
    $content = trim(tidy_repair_string($content, $config, 'utf8'));

    foreach ($replaces as $pattern => $replacement) {
        $result = preg_replace($pattern, $replacement, $content);
        // preg_replace() returns NULL on failure (e.g. invalid UTF-8 with the
        // u flag); keep the previous text rather than discarding everything.
        if ($result !== null) {
            $content = $result;
        }
    }

    // Content may lack an opening <p>: add one when no <p> exists at all or
    // when the first one appears only after some leading text.
    $firstParagraph = strpos($content, '<p>');
    if ($content !== '' && ($firstParagraph === false || $firstParagraph > 6)) {
        $content = '<p>' . $indent . $content;
    }

    // Second repair pass closes what the rules opened and drops empty tags.
    return trim(tidy_repair_string($content, $config, 'utf8'));
}