言語処理100本ノック 2015をやってみた（第５章その３ 44） - "Diary" インターネットさんへの恩返し

言語処理100本ノック 2015の５章のつづき、44「係り受け木の可視化」です。

事前に、以下の記事の手順でpydotを使えるようにしておきます： azwoo.hatenablog.com

プログラム

#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import re
import json
import CaboCha
import pydot

class Morph:
        """Record for a single morpheme.

        The four constructor arguments are stored unchanged as attributes.
        The script in this file fills them from one tab/comma-split line of
        the parsed output: field 0 -> surface, field 7 -> base,
        field 1 -> pos, field 2 -> pos1.
        """

        def __init__(self,surface,base,pos,pos1):
                # Surface string and the base field taken from the same line.
                self.surface, self.base = surface, base
                # First and second part-of-speech fields.
                self.pos, self.pos1 = pos, pos1

class Chunk:
        """Container for one chunk (bunsetsu) of a parsed sentence.

        Attributes:
            morphs: list that the script fills with Morph objects.
            dst:    index of the chunk this one depends on; starts as 0 and is
                    overwritten by the script with the value read from the
                    chunk header line.
            srcs:   list of indices of chunks that depend on this one.
        """

        def __init__(self):
                # Every instance gets its own fresh lists.
                self.morphs, self.dst, self.srcs = [], 0, []


if __name__ == "__main__":

        # Parse the sample sentence with CaboCha and dump the lattice-format
        # result to a temporary file.
        parser = CaboCha.Parser()
        text = "私の友達の名前は山田太郎ですが、昔は和子という名前でした"
        tree = parser.parse(text)
        # The with-block closes (and therefore flushes) the file before it is
        # reopened for reading below.
        with open('temp.txt','w') as lattice_out:
                lattice_out.write(tree.toString(CaboCha.FORMAT_LATTICE))

        # sentences[sentence number][chunk number] -> Chunk
        sentences = {}
        no_sentence = 0
        no_bunsetsu = 0
        sentences[no_sentence] = {}

        # Read the lattice file back and build the Chunk / Morph structure.
        with open('temp.txt','r') as lattice_in:
                for line in lattice_in:
                        # End of sentence: compare the whole line so that a
                        # morpheme whose surface merely starts with "EOS" is
                        # not mistaken for the sentence terminator.
                        if line.rstrip("\r\n") == "EOS":
                                chunks = sentences[no_sentence]
                                # Fill in srcs now that every chunk of the
                                # sentence (and its dst) is known.
                                for src_index in chunks:
                                        dst = chunks[src_index].dst
                                        if dst != "-1":
                                                chunks[int(dst)].srcs.append(src_index)

                                no_sentence += 1
                                no_bunsetsu = 0
                                sentences[no_sentence] = {}

                        # Chunk header line, e.g. "* 0 1D 0/1 0.000000".
                        # Requiring "* " keeps a morpheme with surface "*"
                        # (which is followed by a tab) out of this branch.
                        elif line.startswith("* "):
                                # A list comprehension (instead of filter) is
                                # indexable on both Python 2 and Python 3.
                                fields = [w for w in line.split(' ') if w]
                                chunk = Chunk()
                                # Third field is "<index>D"; drop the "D" so
                                # only the destination chunk index remains.
                                chunk.dst = fields[2].replace("D","")
                                sentences[no_sentence][no_bunsetsu] = chunk
                                no_bunsetsu += 1

                        # Morpheme line: surface, a tab, then comma-separated
                        # features.
                        else:
                                fields = [w for w in re.split(r'\t|,', line) if w]
                                # The morpheme belongs to the most recently
                                # opened chunk.
                                sentences[no_sentence][no_bunsetsu - 1].morphs.append(
                                        Morph(fields[0],fields[7],fields[1],fields[2]))


        # Collect one [source chunk text, destination chunk text] pair for
        # every chunk that has a destination. Punctuation is left in the
        # labels.
        edges = []
        for sentence_no in sorted(sentences):
                chunks = sentences[sentence_no]
                for chunk_no in sorted(chunks):
                        chunk = chunks[chunk_no]
                        # "-1" marks a chunk with no destination.
                        if chunk.dst == "-1":
                                continue
                        src_label = "".join(m.surface for m in chunk.morphs)
                        dst_label = "".join(
                                m.surface for m in chunks[int(chunk.dst)].morphs)
                        edges.append([src_label, dst_label])


        # Render the dependency graph with Graphviz "dot".
        g=pydot.graph_from_edges(edges)
        g.write_jpeg('graph_from_edges_dot.jpg', prog='dot')