"Diary" インターネットさんへの恩返し

いつもソースコードコピペばかりなので，みなさまへ少しばかりの恩返しを

言語処理100本ノック 2015をやってみた（第５章その２ 42,43）

自然言語処理 / 言語処理100本ノック

スポンサーリンク

42. 係り元と係り先の文節の表示＆43. 名詞を含む文節が動詞を含む文節に係るものを抽出

【プログラム】

# coding: utf-8
import sys
import re
import json

# Opens the MeCab output file as soon as the module is loaded.
# NOTE(review): in the visible code this handle is never read or closed, and
# `f` is rebound to 'neko.txt.cabocha' inside the __main__ block -- confirm
# whether this line can simply be removed.
f = open('neko.txt.mecab','r')

class Morph:
    """Plain record for a single morpheme.

    The constructor stores its four arguments unchanged on attributes of
    the same name: ``surface``, ``base``, ``pos`` and ``pos1``.  In this
    file they are filled from one analyser output line (first column,
    then the base-form, part-of-speech and part-of-speech-subtype
    feature columns).
    """

    def __init__(self, surface, base, pos, pos1):
        # No validation or conversion: values are kept exactly as given.
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

class Chunk:
    """Container for one chunk (bunsetsu) of a dependency parse.

    A new instance starts empty; the caller fills it in afterwards.

    Attributes:
        morphs: list the caller appends ``Morph`` objects to.
        dst: number of the chunk this one depends on; starts at ``0``.
        srcs: list of numbers of the chunks that depend on this one.
    """

    def __init__(self):
        # Each instance gets its own fresh lists, so nothing is shared.
        self.srcs = list()
        self.dst = 0
        self.morphs = list()


# Punctuation (ideographic comma / full stop) removed from every output line.
_PUNCTUATION = re.compile(r'、|。')


def _link_sources(chunks):
    """Fill in ``srcs`` on every chunk that some other chunk points at."""
    for index, chunk in enumerate(chunks):
        if chunk.dst != -1:
            chunks[chunk.dst].srcs.append(index)


def parse_cabocha(lines):
    """Parse CaBoCha lattice-format text into sentences of chunks.

    Three kinds of line are recognised:

    * ``EOS`` closes the current sentence,
    * a line starting with ``"* "`` is a chunk header whose third field is
      the head chunk number followed by ``D`` (``-1D`` means no head),
    * a line containing a tab is a morpheme: surface, tab, then
      comma-separated features (part of speech first, base form seventh).

    Any other line (for example a blank one) is ignored.

    Args:
        lines: iterable of text lines; an open file object works.

    Returns:
        A list with one entry per sentence.  Each entry is the list of that
        sentence's ``Chunk`` objects in input order, so a chunk's position
        in the list is its chunk number.  ``dst`` is stored as an ``int``
        and ``srcs`` lists the numbers of the chunks depending on it.

    Raises:
        ValueError: if a morpheme line appears before any chunk header of
            its sentence.
    """
    sentences = []
    chunks = []
    for line_no, raw_line in enumerate(lines, 1):
        # Strip the line ending first so it cannot leak into the last field.
        line = raw_line.rstrip("\r\n")
        if line == "EOS":
            _link_sources(chunks)
            sentences.append(chunks)
            chunks = []
        elif line.startswith("* "):
            header = line.split()
            chunk = Chunk()
            chunk.dst = int(header[2].rstrip("D"))
            chunks.append(chunk)
        elif "\t" in line:
            if not chunks:
                raise ValueError(
                    "line %d: morpheme found before any chunk header" % line_no)
            # Split off the surface before touching commas, so a surface
            # that itself contains a comma stays intact.
            surface, _, feature_text = line.partition("\t")
            features = feature_text.split(",")
            # Unknown words may carry fewer features; pad so that the
            # base-form column always exists.
            features += ["*"] * (7 - len(features))
            chunks[-1].morphs.append(
                Morph(surface, features[6], features[0], features[1]))
    if chunks:
        # Input ended without a final EOS: keep the unfinished sentence.
        _link_sources(chunks)
        sentences.append(chunks)
    return sentences


def _render(chunk, marked_pos=None):
    """Concatenate the surfaces of *chunk*.

    Morphemes whose ``pos`` equals *marked_pos* get ``(pos)`` appended.
    Returns ``(text, found)`` where *found* tells whether any morpheme
    matched *marked_pos*.
    """
    pieces = []
    found = False
    for morph in chunk.morphs:
        if marked_pos is not None and morph.pos == marked_pos:
            found = True
            pieces.append(morph.surface + "(" + morph.pos + ")")
        else:
            pieces.append(morph.surface)
    return "".join(pieces), found


def dependency_pairs(sentences):
    """Exercise 42: yield ``source<TAB>target`` for every dependency.

    Args:
        sentences: list of sentences, each a list of chunk objects that
            expose ``morphs`` and an integer ``dst`` (-1 for no head).

    Yields:
        One string per chunk that has a head: the chunk's text, a tab and
        the head chunk's text, with ``、`` and ``。`` removed.
    """
    for chunks in sentences:
        for chunk in chunks:
            if chunk.dst == -1:
                continue
            source, _ = _render(chunk)
            target, _ = _render(chunks[chunk.dst])
            yield _PUNCTUATION.sub("", source + "\t" + target)


def noun_verb_pairs(sentences):
    """Exercise 43: yield pairs where a noun chunk depends on a verb chunk.

    Args:
        sentences: same structure as for ``dependency_pairs``.

    Yields:
        ``source<TAB>target`` strings, restricted to sources containing a
        noun and targets containing a verb.  Nouns in the source and verbs
        in the target are tagged with their part of speech in parentheses;
        ``、`` and ``。`` are removed.
    """
    for chunks in sentences:
        for chunk in chunks:
            if chunk.dst == -1:
                continue
            source, has_noun = _render(chunk, "名詞")
            if not has_noun:
                continue
            target, has_verb = _render(chunks[chunk.dst], '動詞')
            if has_verb:
                yield _PUNCTUATION.sub("", source + "\t" + target)


if __name__ == "__main__":
    # The context manager closes the file even if parsing fails.
    with open('neko.txt.cabocha', 'r') as cabocha_file:
        parsed_sentences = parse_cabocha(cabocha_file)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    # 42. source chunk / target chunk
    for pair in dependency_pairs(parsed_sentences):
        print(pair)

    # 43. noun chunk depending on verb chunk
    for pair in noun_verb_pairs(parsed_sentences):
        print(pair)

【実行結果 42】

・
・
・
不可思議の      太平に
太平に  入る
吾輩は  死ぬ
死んで  得る
この    太平を
太平を  得る
太平は  得られぬ
死ななければ    得られぬ
ありがたい      ありがたい

【実行結果 43】

・
・
・
楽(名詞)そのもの(名詞)すらも    感じ(動詞)得(動詞)ない
日月(名詞)を    切り落し(動詞)
天地(名詞)を    粉韲し(動詞)て
粉(名詞)韲(名詞)して    入る(動詞)
太平(名詞)に    入る(動詞)
吾輩(名詞)は    死ぬ(動詞)
太平(名詞)を    得る(動詞)
太平(名詞)は    得(動詞)られ(動詞)ぬ