HMM实现中文分词
发布日期:2021-07-01 02:13:10 浏览次数:2 分类:技术文章

本文共 5865 字,大约阅读时间需要 19 分钟。

import warnings
from collections import Counter

import numpy as np

try:
    # Newer hmmlearn releases provide the discrete-symbol model as
    # CategoricalHMM; their MultinomialHMM expects count vectors rather than
    # one symbol id per time step, which is what this script feeds it.
    from hmmlearn.hmm import CategoricalHMM as mhmm
except ImportError:
    try:
        # Older hmmlearn releases: MultinomialHMM is the discrete-symbol model.
        from hmmlearn.hmm import MultinomialHMM as mhmm
    except ImportError:
        # hmmlearn is only needed for decoding in main(); the parameter
        # estimators below stay importable without it.
        mhmm = None

# Hidden states: B (word begin), M (word middle), E (word end), S (single-
# character word). The position of a letter in this string is its state index.
STATES = "BMES"

# Training corpus: each dict maps a sentence to its per-character state tags.
data = [{"我要吃饭": "SSBE"}, {"天气不错": "BEBE"}, {"谢天谢地": "BMME"}]


def prints(s):
    """Print the debug message ``s``."""
    print(s)


def _tagged_sentences():
    """Yield every ``(sentence, tags)`` pair stored in ``data``.

    Raises:
        ValueError: if a sentence and its tag string differ in length, or a
            tag is not one of ``STATES``.
    """
    for entry in data:
        for sentence, tags in entry.items():
            if len(sentence) != len(tags):
                raise ValueError(
                    "sentence %r and tags %r differ in length" % (sentence, tags)
                )
            unknown = set(tags) - set(STATES)
            if unknown:
                raise ValueError(
                    "unknown state tag(s) %r in %r" % (sorted(unknown), tags)
                )
            yield sentence, tags


def _normalize(counts):
    """Turn a list of counts into probabilities that sum to 1.

    An all-zero list yields a uniform distribution so that the result is
    always a valid probability row instead of a division by zero.
    """
    total = sum(counts)
    if total == 0:
        return [1.0 / len(counts)] * len(counts)
    return [n / float(total) for n in counts]


def get_startprob():
    """Estimate the initial state distribution.

    Returns:
        A list of four probabilities, ordered as ``STATES``, computed from the
        first tag of every sentence in ``data``.
    """
    first_tags = Counter()
    for _, tags in _tagged_sentences():
        if not tags:
            continue  # an empty sentence has no first state
        prints("value[0] is " + tags[0])
        first_tags[tags[0]] += 1
        prints("c_map[value[0]] is " + str(first_tags[tags[0]]))
    return _normalize([first_tags[state] for state in STATES])


def get_transmat():
    """Estimate the state transition matrix.

    Returns:
        A 4x4 list of lists; row ``i`` holds the probabilities of moving from
        ``STATES[i]`` to each state. A state that never has a successor in the
        corpus gets a uniform row.
    """
    pair_counts = Counter()
    for _, tags in _tagged_sentences():
        if tags:
            prints("value[0] is " + tags[0])
        # Count every adjacent pair of tags, e.g. "BE" or "MM".
        for prev, nxt in zip(tags, tags[1:]):
            pair_counts[prev + nxt] += 1
    prints("get_transmat's c_map is " + str(dict(pair_counts)))
    return [
        _normalize([pair_counts[src + dst] for dst in STATES]) for src in STATES
    ]


def get_words():
    """Return the vocabulary: each distinct corpus character exactly once.

    Characters appear in order of first occurrence, so every character owns
    exactly one column of the emission matrix.
    """
    return "".join(
        dict.fromkeys(ch for sentence, _ in _tagged_sentences() for ch in sentence)
    )


def get_word_map():
    """Return a dict mapping each vocabulary character to its column index."""
    return {ch: index for index, ch in enumerate(get_words())}


def get_array_from_phase(phase):
    """Encode the string ``phase`` as a list of vocabulary indices.

    Raises:
        KeyError: if ``phase`` contains a character absent from the corpus.
    """
    word_map = get_word_map()
    return [word_map[ch] for ch in phase]


def get_emissionprob():
    """Estimate the emission matrix P(character | state).

    Returns:
        A list with one row per state (ordered as ``STATES``) and one column
        per character of ``get_words()``. Each row is normalised by that
        state's own count, so it sums to 1; a state never seen in the corpus
        gets a uniform row.
    """
    pair_counts = Counter()
    for sentence, tags in _tagged_sentences():
        if tags:
            prints("value[0] is " + tags[0])
        for tag, ch in zip(tags, sentence):
            couple = tag + ch
            prints("emmition's couple is " + couple)
            pair_counts[couple] += 1
    prints("emmition's c_map is " + str(dict(pair_counts)))
    words = get_words()
    return [_normalize([pair_counts[state + ch] for ch in words]) for state in STATES]


def main():
    """Build the HMM from the estimated parameters and decode a sample phrase."""
    if mhmm is None:
        raise ImportError("hmmlearn is required to decode with the HMM")

    print("emissionprob is ", get_emissionprob())
    print("word map is ", get_word_map())

    warnings.filterwarnings("ignore")

    startprob = np.array(get_startprob())
    print("startprob is ", startprob)
    transmat = np.array(get_transmat())
    print("transmat is ", transmat)
    emissionprob = np.array(get_emissionprob())
    print("emmissionprob is ", emissionprob)

    # Parameters are assigned directly; the model is not fitted.
    mul_hmm = mhmm(n_components=len(STATES))
    mul_hmm.startprob_ = startprob
    mul_hmm.transmat_ = transmat
    mul_hmm.emissionprob_ = emissionprob

    phase = "我要吃饭谢天谢地"
    # One observation per row: a column vector of symbol ids.
    X = np.array(get_array_from_phase(phase)).reshape(-1, 1)
    print("X is ", X)
    Y = mul_hmm.predict(X)
    # Decoded state ids: {B, M, E, S} -> {0, 1, 2, 3}
    print("Y is ", Y)


if __name__ == "__main__":
    main()

运行输出(out):

F:\anaconda\pythonw.exe D:/学习资料/网易云课堂/唐宇迪-机器学习课程(新)/自然语言处理(Python版)/第八章:HMM实战/HMM案例实战/HMM/get_hmm_param.py
value[0] is S
emmition's couple is S我
emmition's couple is S要
emmition's couple is B吃
emmition's couple is E饭
value[0] is B
emmition's couple is B天
emmition's couple is E气
emmition's couple is B不
emmition's couple is E错
value[0] is B
emmition's couple is B谢
emmition's couple is M天
emmition's couple is M谢
emmition's couple is E地
emmition's c_map is {'S我': 1, 'S要': 1, 'B吃': 1, 'E饭': 1, 'B天': 1, 'E气': 1, 'B不': 1, 'E错': 1, 'B谢': 1, 'M天': 1, 'M谢': 1, 'E地': 1}
emissionprob is  [[0.0, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.08333333333333333, 0.0], [0.0, 0.0, 0.0, 0.0, 0.08333333333333333, 0.0, 0.0, 0.0, 0.08333333333333333, 0.08333333333333333, 0.0], [0.0, 0.0, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.08333333333333333, 0.0, 0.0, 0.08333333333333333], [0.08333333333333333, 0.08333333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
word map is  {'我': 0, '要': 1, '吃': 2, '饭': 3, '天': 9, '气': 5, '不': 6, '错': 7, '谢': 8, '地': 10}
value[0] is S
c_map[value[0]] is 1
value[0] is B
c_map[value[0]] is 1
value[0] is B
c_map[value[0]] is 2
startprob is  [0.66666667 0.         0.         0.33333333]
value[0] is S
value[0] is B
value[0] is B
get_transmat's c_map is {'SS': 1, 'SB': 1, 'BE': 3, 'EB': 1, 'BM': 1, 'MM': 1, 'ME': 1}
transmat is  [[0.   0.25 0.75 0.  ]
 [0.   0.5  0.5  0.  ]
 [1.   0.   0.   0.  ]
 [0.5  0.   0.   0.5 ]]
value[0] is S
emmition's couple is S我
emmition's couple is S要
emmition's couple is B吃
emmition's couple is E饭
value[0] is B
emmition's couple is B天
emmition's couple is E气
emmition's couple is B不
emmition's couple is E错
value[0] is B
emmition's couple is B谢
emmition's couple is M天
emmition's couple is M谢
emmition's couple is E地
emmition's c_map is {'S我': 1, 'S要': 1, 'B吃': 1, 'E饭': 1, 'B天': 1, 'E气': 1, 'B不': 1, 'E错': 1, 'B谢': 1, 'M天': 1, 'M谢': 1, 'E地': 1}
emmissionprob is  [[0.         0.         0.08333333 0.         0.08333333 0.
  0.08333333 0.         0.08333333 0.08333333 0.        ]
 [0.         0.         0.         0.         0.08333333 0.
  0.         0.         0.08333333 0.08333333 0.        ]
 [0.         0.         0.         0.08333333 0.         0.08333333
  0.         0.08333333 0.         0.         0.08333333]
 [0.08333333 0.08333333 0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]]
X is  [[ 0]
 [ 1]
 [ 2]
 [ 3]
 [ 8]
 [ 9]
 [ 8]
 [10]]
Y is  [3 3 0 2 0 1 1 2]

Process finished with exit code 0

转载地址:https://maoli.blog.csdn.net/article/details/89440323 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:回归:预测燃油效率
下一篇:HMM实践

发表评论

最新留言

很好
[***.229.124.182]2024年04月25日 10时40分24秒