import time
import re
class Segment:
    """Dictionary-based word segmenter using forward and reverse maximum matching.

    The segmenter reads its vocabulary from ``chineseDic.txt`` (first
    comma-separated field of every line) and exposes two greedy strategies:

    * ``MM``  - forward maximum matching, scanning left to right;
    * ``RMM`` - reverse maximum matching, scanning right to left.

    Result formats:

    * ``result_MM``  - every token followed by ``"/ "``, e.g. ``"ab/ X/ cd/ "``.
    * ``result_RMM`` - every token preceded by ``"/"``, in right-to-left
      order, e.g. ``"/cd/X/ab"``.
    """

    # Class-level defaults. ``__init__`` gives every instance its own values;
    # the list below is never mutated in place, so it is not shared state.
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""  # forward maximum matching result
    result_RMM = ""  # reverse maximum matching result
    final_res = ""
    dict = []

    def __init__(self, sentence, MaxLen):
        """Store the sentence and window size, then load the dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum candidate word length, in characters.

        Raises:
            OSError: If ``chineseDic.txt`` cannot be opened.
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        # Per-instance list: appending to the class-level list would make the
        # vocabulary grow with every new Segment.
        self.dict = []
        self.readDict()

    def readDict(self):
        """Append the words found in ``chineseDic.txt`` to ``self.dict``.

        Only the first comma-separated field of each line is kept. Surrounding
        whitespace (including the trailing newline of lines that contain no
        comma) is stripped and empty entries are skipped.
        """
        with open("chineseDic.txt", "r", encoding="utf-8") as f:
            words = [line.split(",")[0].strip() for line in f]
        # Rebind instead of mutating so the class-level default stays empty.
        self.dict = list(self.dict) + [word for word in words if word]

    def _vocabulary(self):
        """Return the dictionary as a set for constant-time membership tests."""
        return set(self.dict)

    def MM(self, nLen, nPos):
        """Run forward maximum matching and append tokens to ``result_MM``.

        Args:
            nLen: Window size used for the first token; later tokens use
                ``self.MaxLen``.
            nPos: Index in ``self.sentence`` at which scanning starts.

        The scan is iterative, so sentence length is not bounded by the
        interpreter's recursion limit.
        """
        vocab = self._vocabulary()
        sentence = self.sentence
        length = len(sentence)
        nPos = max(nPos, 0)
        pieces = []
        while nPos < length:
            # Never look past the end of the sentence; always consume at
            # least one character so the loop makes progress.
            size = max(1, min(nLen, length - nPos))
            while size > 1 and sentence[nPos:nPos + size] not in vocab:
                size -= 1
            # size == 1 is the fallback: an unknown single character is
            # emitted as a token of its own.
            pieces.append(sentence[nPos:nPos + size] + "/ ")
            nPos += size
            nLen = self.MaxLen
        self.result_MM = self.result_MM + "".join(pieces)

    def RMM(self, nLen, nPos):
        """Run reverse maximum matching and append tokens to ``result_RMM``.

        Args:
            nLen: Window size used for the first token; later tokens use
                ``self.MaxLen``.
            nPos: Exclusive end index in ``self.sentence`` from which scanning
                proceeds towards the start (normally ``len(sentence)``).

        Tokens are appended in right-to-left order, each prefixed by ``"/"``.
        """
        vocab = self._vocabulary()
        sentence = self.sentence
        nPos = min(nPos, len(sentence))
        pieces = []
        while nPos > 0:
            # Clamp the window to the characters that remain on the left; a
            # negative slice start would wrap around to the end of the string.
            size = max(1, min(nLen, nPos))
            while size > 1 and sentence[nPos - size:nPos] not in vocab:
                size -= 1
            pieces.append("/" + sentence[nPos - size:nPos])
            nPos -= size
            nLen = self.MaxLen
        self.result_RMM = self.result_RMM + "".join(pieces)

    def getMMResult(self):
        """Return the forward maximum matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the reverse maximum matching result string."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the final result chosen by ``printFinalResult``."""
        return self.final_res

    def printFinalResult(self):
        """Print both segmentations, report whether they agree, set ``final_res``.

        The forward result is always taken as the final result.

        NOTE: tokens are recovered by splitting on ``"/"`` after removing
        spaces, so a literal ``"/"`` or space inside the sentence cannot be
        told apart from a separator and is dropped from the printed lists.
        """
        print("正向最大匹配后果:")
        seg_res_MM = self.result_MM.replace(" ", "")
        print(seg_res_MM)
        seg_list_MM = [word for word in seg_res_MM.split("/") if word]
        print(seg_list_MM)

        print("逆向最大匹配后果:")
        seg_res_RMM = self.result_RMM.replace(" ", "")
        print(seg_res_RMM)
        # RMM collects tokens right to left; reverse to restore reading order.
        seg_list_RMM = [word for word in reversed(seg_res_RMM.split("/")) if word]
        print(seg_list_RMM)

        if seg_list_MM == seg_list_RMM:
            print("两次分词后果统一。")
        else:
            print("两次分词后果不统一。")

        print("最终的分词后果为:")
        self.final_res = self.result_MM
        print(self.final_res)
def to_region(segmentation):
    """Convert a segmentation string into 1-based inclusive character spans.

    Args:
        segmentation: Whitespace-separated tokens, each written as the word
            followed by a single ``"/"`` (e.g. ``"在/ 这/ 中国/"``).

    Returns:
        A list of ``(start, end)`` tuples, one per word, giving the position
        of the word's first and last character in the unsegmented text.
        An empty or whitespace-only string yields an empty list.
    """
    region = []
    start = 1
    # str.split() with no argument splits on runs of whitespace (spaces,
    # tabs, newlines, ...) and never yields empty strings.
    for word in segmentation.split():
        width = len(word) - 1  # drop the trailing "/" marker
        if width <= 0:
            # A lone "/" carries no characters; emitting it would produce an
            # inverted interval such as (5, 4).
            continue
        end = start + width - 1
        region.append((start, end))
        start = end + 1
    return region
def PRF(target, pred):
    """Print and return precision, recall and F1 of predicted spans.

    Args:
        target: Iterable of hashable gold items (e.g. ``(start, end)`` spans).
        pred: Iterable of hashable predicted items.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Any ratio whose
        denominator is zero (empty prediction, empty target, or no overlap)
        is reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)  # items present in both gold and prediction

    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0

    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f
if __name__ == '__main__':
    # Demo: segment a sample sentence with both strategies, time the run and
    # score the forward result against a hand-segmented reference.
    #
    # The sentence must contain exactly the text covered by target_str below;
    # a markdown hyperlink that had been injected into it was removed because
    # it shifted every character position after it.
    test_str = '在这一年中,中国的改革开放和现代化建设持续向前迈进。国民经济放弃了“高增长、低通胀”的良好倒退态势。农业生产再次取得好的收成,企业改革持续深入,人民生存进一步改善。对外经济技术单干与交换不断扩大。'
    seg = Segment(test_str, 3)

    time_start = time.time()
    seg.MM(3, 0)
    seg.RMM(3, len(test_str))
    time_end = time.time()

    seg.printFinalResult()
    print('分词工夫:', time_end - time_start, 's')

    # NOTE(review): two gold tokens ("凋谢", "一直/ 扩充") differ textually from
    # the sentence ("开放", "不断扩大") but have the same lengths, so the
    # position-based regions below are unaffected - confirm the wording.
    target_str = "在/ 这/ 一/ 年/ 中/ ,/ 中国/ 的/ 改革/ 凋谢/ 和/ 现代化/ 建设/ 持续/ 向前/ 迈进/ 。/ 国民经济/ 放弃/ 了/ “/ 高/ 增长/ 、/ 低/ 通胀/ ”/ 的/ 良好/ 倒退/ 态势/ 。/ 农业/ 生产/ 再次/ 取得/ 好/ 的/ 收成/ ,/ 企业/ 改革/ 持续/ 深入/ ,/ 人民/ 生存/ 进一步/ 改善/ 。/ 对外/ 经济/ 技术/ 单干/ 与/ 交换/ 一直/ 扩充/ 。/"

    # Each word is recorded as the closed interval [i, j] of its first and
    # last character position in the text.
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)
    print("分词后果:", re_pred)
    print("标准答案:", re_target)
    PRF(re_target, re_pred)