一、准备训练数据
主要的数据有两个:
1.小黄鸡的聊天语料:噪声很大
2.微博的标题和评论:质量相对较高
二、数据的处理和保存
由于数据中存在大量的噪声,可以对其进行基础的处理,然后分别把input和target使用两个文件保存,即input中的第N行为问,target的第N行为答
后续可能会把单个字作为特征(存放在input_word.txt),也可能会把词语作为特征(input.txt)
2.1 小黄鸡的语料的处理
def format_xiaohuangji_corpus(word=False):
    """Convert the Xiaohuangji chat corpus into parallel input/output files.

    The corpus is read line by line. A line that is exactly ``E`` (after
    stripping) separates conversations, and every line starting with ``M``
    is one utterance. A conversation must hold exactly two utterances
    (question, answer). The question is appended to the input file and the
    answer to the output file, so line N of one file pairs with line N of
    the other. Pairs in which either side is empty are skipped.

    :param word: if True, split every utterance into single characters and
        append to ``input_word.txt`` / ``output_word.txt``; otherwise
        tokenize with ``jieba_cut`` and append to ``input.txt`` /
        ``output.txt``.
    :raises AssertionError: if a conversation closed by an ``E`` line does
        not contain exactly two utterances.
    """
    corpus_path = "./chatbot/corpus/xiaohuangji50w_nofenci.conv"
    if word:
        input_path = "./chatbot/corpus/input_word.txt"
        output_path = "./chatbot/corpus/output_word.txt"
    else:
        input_path = "./chatbot/corpus/input.txt"
        output_path = "./chatbot/corpus/output.txt"

    def tokenize(text):
        # Character-level features vs. word-level features.
        if word:
            return " ".join(text)
        return " ".join(jieba_cut(text))

    # NOTE(review): files are opened with the platform default encoding, as
    # before; the corpus is presumably UTF-8 -- confirm before running on a
    # platform whose default differs.
    # The output files are opened first (append mode), then the corpus, so
    # all three handles are released even if processing fails midway.
    with open(input_path, "a") as f_input, open(output_path, "a") as f_output:

        def write_pair(question, answer):
            # Skip pairs where either side is blank after stripping.
            if question.strip() and answer.strip():
                f_input.write(question + "\n")
                f_output.write(answer + "\n")

        with open(corpus_path) as f_corpus:
            pair = []
            for line in tqdm(f_corpus, ascii=True):
                if line.strip() == "E":
                    if not pair:
                        continue
                    # Explicit raise instead of ``assert`` so the check is
                    # not stripped when Python runs with -O.
                    if len(pair) != 2:
                        raise AssertionError("长度必须是2")
                    write_pair(pair[0], pair[1])
                    pair = []
                elif line.startswith("M"):
                    pair.append(tokenize(line[1:].strip()))

            # "E" marks the start of the next conversation, so the final
            # conversation has no closing marker; write it out here when it
            # is complete. An incomplete trailing conversation is dropped.
            if len(pair) == 2:
                write_pair(pair[0], pair[1])
2.2 微博语料的处理
def format_weibo(word=False):
    """Clean the Weibo post/response corpus and append it to the training files.

    Posts and responses are read in lockstep from ``stc_weibo_train_post``
    and ``stc_weibo_train_response``. Only the post is cleaned: bracketed
    trailers, "alink" placeholders, image-size annotations, hashtags,
    square brackets, runs of dashes/underscores and "via ..." suffixes are
    replaced by a space, and whitespace runs are collapsed. Pairs whose post
    or response is empty after cleaning are skipped. The remaining noise in
    the data is left untouched.

    :param word: if True, remove all whitespace and write every character
        separated by a single space to ``input_word.txt`` /
        ``output_word.txt``; otherwise write the cleaned lines as they are
        to ``input.txt`` / ``output.txt``.
    :return: None
    """
    origin_input = "./chatbot/corpus/stc_weibo_train_post"
    origin_output = "./chatbot/corpus/stc_weibo_train_response"
    if word:
        input_path = "./chatbot/corpus/input_word.txt"
        output_path = "./chatbot/corpus/output_word.txt"
    else:
        input_path = "./chatbot/corpus/input.txt"
        output_path = "./chatbot/corpus/output.txt"

    # \uff08 / \uff09 are the full-width parentheses. They are written as
    # escapes so that punctuation normalisation of this file cannot turn
    # them into ASCII "(" / ")", which would become a regex group that
    # swallows the whole post.
    closing_marks = ("\uff09", "」", ")")
    bracketed = re.compile(r"\uff08.*\uff09|「.*?」|\(.*?\)")
    noise = re.compile(
        r"我在.*?alink|alink|\uff08.*?\d+x\d+.*?\uff09|#|】|【|-+|_+|via.*?:*.*"
    )
    whitespace = re.compile(r"\s+")

    # NOTE(review): files are opened with the platform default encoding, as
    # before; the corpus is presumably UTF-8 -- confirm before running on a
    # platform whose default differs.
    with open(input_path, "a") as f_input, open(output_path, "a") as f_output:
        with open(origin_input) as in_o, open(origin_output) as out_o:
            for _in, _out in tqdm(zip(in_o, out_o), ascii=True):
                post = _in.strip()
                response = _out.strip()

                # Bracketed text is only stripped when the post ends with a
                # closing bracket.
                if post.endswith(closing_marks):
                    post = bracketed.sub(" ", post)
                post = noise.sub(" ", post)
                # Strip after collapsing so a post reduced to a single space
                # is treated as empty instead of being written as a blank line.
                post = whitespace.sub(" ", post).strip()

                if not post or not response:
                    continue

                if word:
                    # Turn each side into one run without spaces, then put a
                    # space between every character.
                    post = whitespace.sub("", post)
                    response = whitespace.sub("", response)
                    f_input.write(" ".join(post) + "\n")
                    f_output.write(" ".join(response) + "\n")
                else:
                    f_input.write(post + "\n")
                    f_output.write(response + "\n")