• 欢迎访问搞代码网站,推荐使用最新版火狐浏览器和Chrome浏览器访问本网站!
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏搞代码吧

PHP制作百度词典查词采集器_PHP

php 搞代码 4年前 (2022-01-25) 34次浏览 已收录 0个评论

百度dict 采集样本

这是我写的一个采集器,用来采集百度词典(dict.baidu.com)查词后返回的全部结果数据;项目里还附带了 13.5 万条单词的词库和一个简单的采集示例。这里把主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict,有需要的直接 fork 即可。这个东西用的人很少,觉得有用的朋友尽管拿去用。

 /**
  * Scrapes the entry for one word from Baidu Dict (dict.baidu.com) and returns
  * its parts: phonetic symbols, pronunciation URLs, example sentences, concise
  * definitions, synonyms/antonyms and phrases.
  *
  * NOTE(review): this listing was recovered from a copy in which HTML stripping
  * removed the opening PHP tag, the class header, the property declarations, the
  * names of several regex groups and part of getPhrase(). The class name is
  * taken from the usage sample at the bottom; the properties are the ones the
  * methods read. Confirm every "NOTE(review)" item against the upstream project
  * (http://github.com/widuu/baidu_dict) before relying on it.
  */
 class dict
 {
     /** Word currently being looked up; set by content(). */
     private $word;

     /**
      * Maximum number of phrase entries getPhrase() returns.
      * NOTE(review): the original default is not visible here; 10 is a placeholder.
      */
     private static $num = 10;

     /** Word whose page is held in $cachedPage (null when nothing is cached). */
     private $cachedWord = null;

     /** HTML of the most recently downloaded page. Only one page is kept. */
     private $cachedPage = '';

     /**
      * Look up a word and return everything scraped for it.
      *
      * @param string $word English word to look up
      * @return array array(
      *     "symbol"  => phonetic symbols,
      *     "pro"     => pronunciation URLs,
      *     "example" => example sentences,
      *     "explain" => concise definitions,
      *     "synonym" => synonyms and antonyms,
      *     "phrase"  => phrases
      * )
      */
     public function content($word)
     {
         $this->word = $word;

         return array(
             "symbol"  => $this->Pronounced(),
             "pro"     => $this->getSay(),
             "example" => $this->getExample(),
             "explain" => $this->getExplain(),
             "synonym" => $this->getSynonym(),
             "phrase"  => $this->getPhrase(),
         );
     }

     /**
      * Download the Baidu Dict page for the current word.
      *
      * Every getter calls this method, so the page of the latest word is kept and
      * reused instead of being downloaded once per getter. Only one page is held,
      * which keeps memory flat when many words are processed with one instance.
      *
      * @return string page HTML, or '' when the request failed
      */
     private function getContent()
     {
         $word = (string) $this->word;
         if ($this->cachedWord === $word) {
             return $this->cachedPage;
         }

         $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
         // Encode the word so spaces, '&' or '#' cannot break or extend the query string.
         $url = "http://dict.baidu.com/s?wd=" . urlencode($word);

         $ch = curl_init();
         curl_setopt($ch, CURLOPT_URL, $url);
         curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
         curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
         curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
         curl_setopt($ch, CURLOPT_HTTPGET, 1);
         curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
         curl_setopt($ch, CURLOPT_HEADER, 0);
         curl_setopt($ch, CURLOPT_TIMEOUT, 30);
         $result = curl_exec($ch);
         // The error must be read from the handle that was executed ($ch).
         if (curl_errno($ch)) {
             echo 'Errno' . curl_error($ch);
         }
         curl_close($ch);

         if ($result === false) {
             // Failed downloads are not cached, so a later call can retry.
             return '';
         }

         $this->cachedWord = $word;
         $this->cachedPage = $result;

         return $result;
     }

     /**
      * Build the array('en' => ..., 'us' => ...) pair from preg_match_all() output.
      *
      * @param array $matches matches array filled by preg_match_all()
      * @return array first two captures of group 1; null where a capture is missing
      */
     private function enUsPair($matches)
     {
         return array(
             'en' => isset($matches[1][0]) ? $matches[1][0] : null,
             'us' => isset($matches[1][1]) ? $matches[1][1] : null,
         );
     }

     /**
      * Get the phonetic symbols.
      *
      * @return array array('en' => British, 'us' => American); null when not found
      */
     private function Pronounced()
     {
         preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui", $this->getContent(), $found);

         return $this->enUsPair($found);
     }

     /**
      * Get the pronunciation URLs (the first two url="..." values on the page).
      *
      * @return array array('en' => British, 'us' => American); null when not found
      */
     private function getSay()
     {
         preg_match_all("/url=\"(.*)\"/Ui", $this->getContent(), $found);

         return $this->enUsPair($found);
     }

     /**
      * Get the example sentences from the page's `example_data` script variable.
      *
      * @return array list of chunks of up to two sentences each (presumably a
      *               sentence followed by its translation); empty array when the
      *               page has no example data
      */
     private function getExample()
     {
         // \s+ / \s* tolerate both "var example_data = " and a wrapped variant.
         if (!preg_match("/var\s+example_data\s*=\s*(.*)\]\;/Us", $this->getContent(), $found)) {
             return array();
         }

         $sentences = array();
         $groups = explode("[[[", "[[[" . ltrim($found[1], "["));
         foreach ($groups as $group) {
             foreach (explode("[[", "[[" . $group) as $piece) {
                 // The first string of every [ "...", ... ] item is one word of the sentence.
                 preg_match_all("/\[\"(.*)\",/Us", "[" . $piece, $words);
                 if (!empty($words[1])) {
                     $sentences[] = implode(" ", $words[1]);
                 }
             }
         }

         return array_chunk($sentences, 2);
     }

     /**
      * Get the concise definitions.
      *
      * @return array array(
      *     "x" => array(part of speech => definition),
      *     "b" => array(form label => word form)
      * ); both lists are empty when the section is missing
      */
     private function getExplain()
     {
         $pattern = "/id\=\"en\-simple\-means\"\>(.*)\<div(\s+)class\=\"source\"\>/Us";
         if (!preg_match($pattern, $this->getContent(), $section)) {
             return array("x" => array(), "b" => array());
         }
         $html = $section[1];

         // Group names restored from the keys read below ("adj", "name").
         preg_match_all(
             "/\<p\>\<strong\>(?P<adj>.*)\<\/strong\>\<span\>(?P<name>.*)\<\/span\>\<\/p\>/Us",
             $html,
             $senses
         );
         // NOTE(review): only the group names ("tag", "word") are certain here; the
         // part of the pattern between ':' and the word was lost, so any <a ...>
         // opening tag is accepted. Confirm against the upstream source.
         preg_match_all(
             "/\<span\>(?P<tag>[^\>]+)\:\s*\<a[^\>]*\>(?P<word>.*)\<\/a\>\<\/span\>/Us",
             $html,
             $forms
         );

         $definitions = array();
         foreach ($senses["adj"] as $i => $partOfSpeech) {
             $definitions[$partOfSpeech] = $senses["name"][$i];
         }

         $wordForms = array();
         foreach ($forms["tag"] as $i => $label) {
             $wordForms[$label] = strip_tags($forms["word"][$i]);
         }

         return array("x" => $definitions, "b" => $wordForms);
     }

     /**
      * Reduce an HTML fragment to plain text, with a single space where tags were.
      *
      * @param string $html HTML fragment
      * @return string text with tags removed and whitespace collapsed
      */
     private function plainText($html)
     {
         $spaced = preg_replace('/<[^>]*>/', ' ', $html);

         return trim(preg_replace('/\s+/', ' ', strip_tags($spaced)));
     }

     /**
      * Get synonyms and antonyms.
      *
      * @return array array(block index => array(part of speech => array(title => words)));
      *               empty array when the section is missing
      */
     private function getSynonym()
     {
         // NOTE(review): the closing "<div>" is kept exactly as found; the original
         // may have carried attributes that were stripped from this listing.
         if (!preg_match("/id=\"en\-syn\-ant\"\>(.*)<div>/Us", $this->getContent(), $section)) {
             return array();
         }

         $result = array();
         foreach (explode("</dl>", $section[1]) as $index => $block) {
             // Group names restored from the keys read below ("adj", "content").
             preg_match_all(
                 "/\<strong\>(?P<adj>.*)\&nbsp\;\<\/strong\>\<\/div\>\<div(\s+)class\=\"syn\-ant\-list\"\>\<ul\>(?P<content>.*)\<\/ul\>/Us",
                 $block,
                 $lists
             );
             foreach ($lists["content"] as $k => $items) {
                 if (empty($items)) {
                     continue;
                 }
                 preg_match_all("/\<li\>\<p\>(?P<title>.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $items, $entries);
                 foreach ($entries['title'] as $m => $title) {
                     // NOTE(review): the original replaced one (lost) pattern with a
                     // space before strip_tags(); every tag is replaced here so
                     // neighbouring words do not run together.
                     $result[$index][$lists["adj"][$k]][$title] = $this->plainText($entries["value"][$m]);
                 }
             }
         }

         return $result;
     }

     /**
      * Get phrases, at most self::$num of them.
      *
      * NOTE(review): the middle of this method was lost from the listing. The two
      * branches below are rebuilt around the surviving tail (which wraps the
      * stored value as array(previous value, stripped pair)); verify them against
      * the upstream source.
      *
      * @return array array(phrase => meaning), or array(phrase => array(meaning, pair))
      *               nested once per extra pair of paragraphs; empty array when
      *               the section is missing
      */
     private function getPhrase()
     {
         $pattern = "/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us";
         if (!preg_match($pattern, $this->getContent(), $section)) {
             return array();
         }

         $entries = array_slice(explode("</dd>", $section[1]), 0, self::$num);
         $result = array();
         foreach ($entries as $entry) {
             $paragraphs = explode("</p>", $entry);
             $n = count($paragraphs);
             if ($n < 2) {
                 // Trailing fragment after the last </dd>: no phrase/meaning pair in it.
                 continue;
             }

             $title = trim(str_replace("&nbsp;", "", strip_tags($paragraphs[0])));
             $result[$title] = strip_tags($paragraphs[1]);
             if ($n <= 3) {
                 continue;
             }

             // Paragraphs after the first two (minus the empty tail) come in pairs.
             $head = array_slice($paragraphs, 0, 2);
             $rest = array_diff(array_slice($paragraphs, 0, $n - 1), $head);
             foreach (array_chunk($rest, 2) as $pair) {
                 $pair = array_map('strip_tags', $pair);
                 $result[$title] = array($result[$title], $pair);
             }
         }

         return $result;
     }

     /**
      * Convert an array to its string representation.
      *
      * @param array $data       data to convert
      * @param bool  $isformdata when 0, new_stripslashes() is not applied; optional, defaults to 1
      * @return string slash-escaped var_export() output, or '' when $data == ''
      */
     private function array2string($data, $isformdata = 1)
     {
         if ($data == '') {
             return '';
         }
         if ($isformdata) {
             $data = $this->new_stripslashes($data);
         }

         return addslashes(var_export($data, true));
     }

     /**
      * Apply stripslashes() to a string, or recursively to every element of an array.
      *
      * @param mixed $string string or array to process
      * @return mixed the processed string or array
      */
     private function new_stripslashes($string)
     {
         if (!is_array($string)) {
             return stripslashes($string);
         }
         foreach ($string as $key => $val) {
             $string[$key] = $this->new_stripslashes($val);
         }

         return $string;
     }
 }

 // Usage:
 // $dict = new dict();
 // $result = $dict->content("express");

以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。


搞代码网(gaodaima.com)提供的所有资源部分来自互联网,如果有侵犯您的版权或其他权益,请说明详细缘由并提供版权或权益证明然后发送到邮箱[email protected],我们会在看到邮件的第一时间内为您处理,或直接联系QQ:872152909。本网站采用BY-NC-SA协议进行授权
转载请注明原文链接:PHP制作百度词典查词采集器_PHP
喜欢 (0)
[搞代码]
分享 (0)
发表我的评论
取消评论

表情 贴图 加粗 删除线 居中 斜体 签到

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址