ナード戦隊データマン

データサイエンスを用いて悪と戦うぞ

英語word2vecを日本語word2vecに変換

マルチリンガルなWord Embeddingとは、複数の言語に対応するためのWord Embeddingの手法です。今回は、最も初歩的なモデルとして線形写像を使います。

概要

線形写像を用いたMultilingual Word Embeddingは、ソース言語のベクトルから、ターゲット言語のベクトルへの写像を学習することを意味します。

ソース言語とターゲット言語のベクトルを対応させる教師データは、以下の形式になっています:

reds レッズ
posters ポスター
expelled    退学
strait  海峡
communicate コミュニケーション
alphabet    アルファベット
chef    シェフ
prizes  懸賞
prizes  賞品
eighteen    十八
shirt   シャツ

この対応関係を定義したものを、"parallel vocabulary"といいます。

擬似言語的に書けば、以下のような方法で予測を行うためのmodel.predict関数を学習したいわけです。

# ja_kv: 日本語のword2vec
# en_kv: 英語のword2vec
# model: 英語から日本語へベクトル変換するモデル
# src_word: 英単語
ja_kv.most_similar(
    model.predict(en_kv[src_word])
)

このようなモデルを学習するための、精度の高い手法としては、facebookresearch/MUSE1 がありますが、もっと基本的な理解をするために、今回は線形写像によるモデルを試します。

事前準備

以下から、fasttextの日本語モデルと英語モデルをダウンロード。 https://fasttext.cc/docs/en/pretrained-vectors.html

以下から訓練データ、テストデータをダウンロード。 https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ja.0-5000.txt https://dl.fbaipublicfiles.com/arrival/dictionaries/en-ja.5000-6500.txt

訓練のコード

from keras.models import Model
from keras.layers import Dense, Dropout, Input, concatenate
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from sklearn.utils import shuffle
import io
import numpy as np


def load_vec(emb_path, nmax=500000000000000000):
    vectors = []
    word2id = {}
    with io.open(emb_path,
                 'r',
                 encoding='utf-8',
                 newline='\n',
                 errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
        id2word = {v: k for k, v in word2id.items()}
        embeddings = np.vstack(vectors)
        return embeddings, id2word, word2id


def build_model(dim=300):
    inp = Input(shape=(dim,))
    x = Dense(1024, activation="linear")(inp)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation="linear")(x)
    x = concatenate([x, inp])
    x = Dropout(0.5)(x)
    out = Dense(dim, activation="linear")(x)
    model = Model(inp, out)
    opt = optimizers.Adam(lr=0.001, clipvalue=0.5)
    model.compile(loss="mse", optimizer=opt, metrics=["mse"])
    return model

def generate_data(train, en_embeddings, en_word2id, ja_embeddings, ja_word2id):
    X = []
    y = []
    while True:
        for x_tmp, y_tmp in train:
            try:
                tmp = [
                    en_embeddings[en_word2id[x_tmp]],
                    ja_embeddings[ja_word2id[y_tmp]]
                ]
            except Exception as e:
                print(repr(e))
                continue
            X.append(tmp[0])
            y.append(tmp[1])
            if len(X) > 1000:
                assert len(X) == len(y)
                yield np.array(X), np.array(y)
                X = []
                y = []


if __name__ == "__main__":
    print("loading")
    en_emb, _, en_w2id = load_vec("../data/wiki.en.vec")
    ja_emb, _, ja_w2id = load_vec("../data/wiki.ja.vec")
    print("done loading")

    print("loading data")
    with open("../data/jadata/en-ja.0-5000.txt") as f:
        data = []
        for line in f:
            data.append(line.strip().split("\t"))
        data = shuffle(data)
    print("data loading done")

    print("split data")
    size = int(len(data)*0.8)
    train, val = data[:size], data[size:]
    print("done splitting")

    print("training...")
    model = build_model()
    callbacks = [
        ModelCheckpoint("model.h5",
                        save_best_only=False,
                        monitor="val_loss",
                        mode="min"),
        ModelCheckpoint("model_best.h5",
                        save_best_only=True,
                        monitor="val_loss",
                        mode="min")
    ]
    model.fit_generator(
        generate_data(train, en_emb, en_w2id, ja_emb, ja_w2id),
        validation_data=generate_data(val, en_emb, en_w2id, ja_emb, ja_w2id),
        steps_per_epoch=1000,
        validation_steps=1,
        epochs=1000,
        callbacks=callbacks
    )
    print("done training")

テスト

In[1]:

from gensim.models import KeyedVectors
enmodel = KeyedVectors.load_word2vec_format("./data/wiki.en.vec")
jamodel = KeyedVectors.load_word2vec_format("./data/wiki.ja.vec")

In[2]:

with open("data/crosslingual/dictionaries/en-ja.5000-6500.txt") as f:
    test_data = [line.strip().split() for line in f]

In[3]:

def p_at_k(line, model, enmodel, jamodel, k=10):
    mid_vec = model.predict(np.array([enmodel[line[0]]]))
    heystack = [x[0] for x in jamodel.most_similar(mid_vec, topn=k)]
    needle = line[1]
    try:
        y = float(heystack.index(needle))/float(k)
    except ValueError:
        y = 0.0
    return y

precisions = [p_at_k(line, model, enmodel, jamodel, k=10) for line in test_data]
print(np.mean(precisions))

Out[3]:

0.021623123957754307

In[4]:

import pprint
src = ["obama", "murder", "mickey", "book", "god", "os", "driver"]
for src_word in src:
    mid_vec = model.predict(np.array([enmodel[src_word]]))
    print(src_word)
    pprint.pprint(jamodel.most_similar(mid_vec))
    print("")

Out[4]:

obama
[('バマ', 0.9984610676765442),
 ('ベギー', 0.9984151721000671),
 ('米紙', 0.9983594417572021),
 ('発言', 0.9983515739440918),
 ('米国', 0.9983422160148621),
 ('ネディ', 0.9983240365982056),
 ('ケシー', 0.998323917388916),
 ('我々', 0.9982770085334778),
 ('エハン', 0.998260498046875),
 ('文民', 0.9982603788375854)]

murder
[('犯', 0.9995188117027283),
 ('殺', 0.9995153546333313),
 ('罪', 0.9994887113571167),
 ('強談', 0.9994596242904663),
 ('死', 0.9994287490844727),
 ('男囚', 0.9993982315063477),
 ('醜行', 0.9993961453437805),
 ('騙', 0.999383807182312),
 ('急告', 0.9993802905082703),
 ('妻', 0.9993776082992554)]

mickey
[('パッジ', 0.9990284442901611),
 ('ポビー', 0.9989916682243347),
 ('ゲビー', 0.9989231824874878),
 ('ゴーツ', 0.9989019632339478),
 ('マベル', 0.998875617980957),
 ('ケシー', 0.9988710880279541),
 ('ノニー', 0.9988632202148438),
 ('コッズ', 0.9988256692886353),
 ('ソギー', 0.998824954032898),
 ('ケザー', 0.9988231658935547)]

book
[('書', 0.9995638728141785),
 ('題意', 0.9995237588882446),
 ('冊', 0.9995195865631104),
 ('自注', 0.9995168447494507),
 ('私', 0.9995006918907166),
 ('彼', 0.9994931817054749),
 ('著', 0.9994920492172241),
 ('述', 0.9994770884513855),
 ('小編', 0.9994733333587646),
 ('小冊', 0.9994729161262512)]

god
[('定命', 0.9994326233863831),
 ('神', 0.9994012117385864),
 ('彼', 0.9993762373924255),
 ('意想', 0.9993680715560913),
 ('空身', 0.9993419647216797),
 ('壮者', 0.9993240237236023),
 ('ヘズ', 0.9993231892585754),
 ('邪説', 0.999315083026886),
 ('全智', 0.9993149042129517),
 ('神゜', 0.9993104934692383)]

os
[('os', 0.9990252256393433),
 ('実装', 0.9986990094184875),
 ('pc', 0.9984074831008911),
 ('動作', 0.9983968138694763),
 ('hax', 0.9983946084976196),
 ('起動', 0.9983715415000916),
 ('eβc', 0.9983643889427185),
 ('osx', 0.9983490109443665),
 ('βc', 0.9983416795730591),
 ('入力', 0.9983031749725342)]

driver
[('開車', 0.9989429712295532),
 ('左用', 0.9988387823104858),
 ('車', 0.9988120198249817),
 ('一台', 0.998807966709137),
 ('速配', 0.9987748861312866),
 ('走回', 0.9987524747848511),
 ('出格', 0.9987507462501526),
 ('男用', 0.9987356066703796),
 ('も', 0.9987247586250305),
 ('競る', 0.9987195730209351)]

考察

precisionはそれほど高くはないですが、いくつかの語に対する目視確認をしてみると、概ね英単語に近い日本語が予測されています。

predict関数が意味するものは、「英語ベクトルを、日本語のベクトル空間内に写像する」ということなので、同一空間内に英語ベクトルと日本語ベクトルを(変換によって)置くことができます。

facebookresearch/MUSE[^1]以外の、過去のMultilingual Word Embeddingの研究は、概ねこの線形写像の方法を改善する方向で行われてきているようです。

精度の評価には、単語翻訳におけるP@kを使っていますが、MUSEが行っているように、複数の精度評価を組み合わせて、その中でベストのものを選ぶようなやり方のほうが、よりよいモデルを選択できる可能性があります。

詳細については、WORD TRANSLATION WITHOUT PARALLEL DATA2という論文が詳しいようです。(私もまだよく理解していないです。)

エンティティ名と定義文からタイプ予測

keras-self-attention1 は、self-attentionを簡単に使うためのライブラリです。今回は、このライブラリを使うことによって、エンティティタイプ予測の問題をより汎用的な特徴量から学習させられることを確かめます。

事前準備

pip install keras-self-attention

訓練部分のコード

import json
import sqlite3
from random import choice
 
import numpy as np
import sentencepiece as spm
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Input, Embedding, Dropout, concatenate, Flatten
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention
 
sp = spm.SentencePieceProcessor()
sp.load("./spmodel/en.wiki.bpe.vs10000.model")
 
 
def build_model(num_labels,
                maxlen1=128,
                maxlen2=128,
                max_features=10001,
                dim1=50,
                dim2=50):
    inputs = Input(shape=(maxlen1, ))
    inputs2 = Input(shape=(maxlen2, ))
    emb1 = Embedding(max_features, dim1, input_length=maxlen1)(inputs)
    emb2 = Embedding(max_features, dim2, input_length=maxlen2)(inputs2)
    tmp = concatenate([emb1, emb2])
    att = SeqSelfAttention(attention_activation='sigmoid')(tmp)
    tmp2 = Flatten()(emb1)
    tmp3 = Flatten()(att)
    tmp = concatenate([tmp2, tmp3])
    tmp = Dense(1024, activation="linear")(tmp)
    out = Dropout(0.5)(tmp)
    output = Dense(units=num_labels, activation='softmax')(out)
    model = Model([inputs, inputs2], output)
    opt = optimizers.Adam(lr=0.001, clipvalue=0.5)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    return model
 
 
def preprocessing(texts, sp=sp, maxlen=50):
    return pad_sequences([sp.EncodeAsIds(text) for text in texts],
                         maxlen=maxlen)
 
 
def generate_data(data, labels, batch_size=1000):
    X0 = []
    X1 = []
    y = []
    while True:
        for i, d in enumerate(data):
            X0.append(d["entity"].replace("_", " "))
            X1.append(d["definition"])
            label = np.zeros(len(labels.keys()))
            label[int(labels[d["type"]])] = 1.0
            y.append(label)
            if len(X0) > 1000:
                yield [preprocessing(X0, maxlen=128), preprocessing(X1, maxlen=128)], np.array(y)
                X0 = []
                X1 = []
                y = []
 
def generate_test_data(test, labels):
    X0 = []
    X1 = []
    y = []
    for i, d in enumerate(test):
        X0.append(d["entity"].replace("_", " "))
        X1.append(d["definition"])
        y.append(int(labels[d["type"]]))
    return [preprocessing(X0, maxlen=128), preprocessing(X1, maxlen=128)], np.array(y)
 
 
if __name__ == "__main__":
    from sklearn.utils import shuffle
    from sklearn.metrics import classification_report
    with open("./labels.json") as f:
        key2i = json.load(f)[1]
 
    with open("./wiki_definition.json") as f:
        data = shuffle(json.load(f))
 
    size = int(len(data) * 0.8)
    train, test = data[:size], data[size:]
    size = int(len(test) * 0.8)
    test, val = test[:size], test[size:]
 
    model = build_model(len(key2i.keys()))
    callbacks = [
        ModelCheckpoint("model.h5",
                        save_best_only=False,
                        monitor="val_loss",
                        mode="min"),
        ModelCheckpoint("model_best.h5",
                        save_best_only=True,
                        monitor="val_loss",
                        mode="min")
    ]
    model.fit_generator(generate_data(train, key2i),
                        validation_data=generate_data(val, key2i),
                        validation_steps=10,
                        steps_per_epoch=1000,
                        epochs=3,
                        verbose=1,
                        callbacks=callbacks)
 
    X_test, y_test = generate_test_data(test, key2i)
 
    y_pred = [np.argmax(x) for x in model.predict(X_test)]
 
    with open("eval.txt", "w") as f:
        f.write(classification_report(y_test, y_pred))

Note: labels.jsonには、ラベル名とラベルIDの対応が入っています。

Note: wiki_definition.jsonには、定義文、エンティティ名, エンティティタイプが格納されています。

精度

スマホなど、ブラウザの横幅が狭い場合は、以下の精度を横にスクロールさせて見てください。

                                             precision recall    f1-score  support   

AcademicConference                           0.71      0.33      0.45      15        
AcademicJournal                              0.92      0.95      0.94      1075      
Actor                                        0.49      0.16      0.24      630       
AdministrativeRegion                         0.84      0.81      0.82      3307      
AdultActor                                   0.76      0.61      0.68      171       
Aircraft                                     0.92      0.96      0.94      1726      
Airline                                      0.96      0.95      0.96      582       
Airport                                      0.97      0.94      0.96      2177      
Album                                        0.96      0.96      0.96      19501     
AmateurBoxer                                 0.29      0.08      0.12      75        
Ambassador                                   0.28      0.36      0.31      83        
AmericanFootballLeague                       0.00      0.00      0.00      10        
AmericanFootballPlayer                       0.80      0.90      0.85      3070      
AmericanFootballTeam                         0.00      0.00      0.00      5         
Amphibian                                    0.91      0.95      0.93      563       
AmusementParkAttraction                      0.76      0.55      0.64      64        
AnatomicalStructure                          0.56      0.64      0.60      268       
Animal                                       0.73      0.83      0.77      1096      
AnimangaCharacter                            0.50      0.05      0.08      22        
Anime                                        0.41      0.10      0.16      161       
Arachnid                                     0.94      0.84      0.89      632       
Archaea                                      0.71      0.56      0.63      27        
Architect                                    0.58      0.34      0.43      457       
ArchitecturalStructure                       0.33      0.05      0.08      43        
Artery                                       1.00      0.83      0.91      52        
ArtificialSatellite                          0.89      0.94      0.91      371       
Artist                                       0.47      0.38      0.42      2267      
ArtistDiscography                            0.98      0.99      0.98      560       
Artwork                                      0.82      0.81      0.82      760       
Asteroid                                     0.00      0.00      0.00      2         
Astronaut                                    0.77      0.79      0.78      104       
Athlete                                      0.81      0.86      0.84      5763      
AustralianFootballLeague                     0.00      0.00      0.00      1         
AustralianFootballTeam                       0.72      0.88      0.79      72        
AustralianRulesFootballPlayer                0.98      0.99      0.98      1959      
AutoRacingLeague                             0.00      0.00      0.00      1         
Automobile                                   0.92      0.86      0.89      887       
AutomobileEngine                             0.87      0.76      0.81      54        
Award                                        0.82      0.93      0.87      788       
Bacteria                                     0.45      0.37      0.41      145       
BadmintonPlayer                              0.96      0.88      0.92      241       
Band                                         0.86      0.79      0.83      5259      
Bank                                         0.72      0.41      0.52      425       
Baronet                                      0.68      0.81      0.74      118       
BaseballLeague                               0.58      0.88      0.70      48        
BaseballPlayer                               0.96      0.98      0.97      3303      
BaseballSeason                               0.96      0.42      0.58      53        
BaseballTeam                                 0.00      0.00      0.00      2         
BasketballLeague                             0.88      0.62      0.73      69        
BasketballPlayer                             0.92      0.94      0.93      1930      
BasketballTeam                               0.87      0.82      0.85      271       
Bay                                          0.00      0.00      0.00      1         
BeachVolleyballPlayer                        0.55      0.43      0.48      28        
BeautyQueen                                  0.87      0.76      0.81      339       
Beverage                                     0.79      0.56      0.66      130       
BiologicalDatabase                           0.66      0.67      0.67      46        
Bird                                         0.98      0.92      0.95      2012      
BodyOfWater                                  0.68      0.45      0.54      195       
Bodybuilder                                  0.90      0.43      0.58      44        
Bone                                         0.65      0.47      0.55      55        
Book                                         0.85      0.90      0.87      5439      
Boxer                                        0.85      0.86      0.85      642       
Brain                                        0.78      0.77      0.77      82        
Brewery                                      0.18      0.07      0.10      43        
Bridge                                       0.85      0.92      0.88      694       
BroadcastNetwork                             0.34      0.28      0.31      197       
Building                                     0.67      0.78      0.72      7466      
BusCompany                                   0.87      0.74      0.80      223       
BusinessPerson                               0.00      0.00      0.00      109       
CanadianFootballLeague                       0.00      0.00      0.00      1         
CanadianFootballTeam                         0.56      0.29      0.38      31        
Canal                                        0.84      0.65      0.73      63        
Canoeist                                     0.90      0.89      0.89      70        
Cardinal                                     0.57      0.35      0.43      117       
Castle                                       0.60      0.72      0.65      232       
Cave                                         0.87      0.67      0.76      89        
Chancellor                                   0.00      0.00      0.00      16        
Cheese                                       0.73      0.71      0.72      42        
Chef                                         0.61      0.50      0.55      90        
ChemicalCompound                             0.86      0.90      0.88      1550      
ChessPlayer                                  0.85      0.80      0.82      226       
ChristianBishop                              0.75      0.77      0.76      1413      
City                                         0.70      0.76      0.73      3115      
ClassicalMusicArtist                         0.00      0.00      0.00      60        
ClassicalMusicComposition                    0.55      0.36      0.44      99        
Cleric                                       0.71      0.48      0.57      414       
ClubMoss                                     0.00      0.00      0.00      17        
College                                      0.86      0.86      0.86      14        
CollegeCoach                                 0.83      0.82      0.83      1053      
Colour                                       0.75      0.63      0.69      38        
Comedian                                     0.49      0.23      0.32      194       
ComedyGroup                                  0.00      0.00      0.00      7         
Comic                                        0.75      0.79      0.77      344       
ComicStrip                                   0.88      0.46      0.61      65        
ComicsCharacter                              0.89      0.84      0.87      620       
ComicsCreator                                0.71      0.55      0.62      426       
Company                                      0.76      0.84      0.80      7920      
ConcentrationCamp                            0.33      0.08      0.13      12        
Congressman                                  0.42      0.47      0.45      493       
Conifer                                      0.86      0.75      0.80      110       
Constellation                                0.90      0.60      0.72      15        
Continent                                    0.00      0.00      0.00      5         
Convention                                   0.58      0.59      0.59      327       
Country                                      0.70      0.52      0.60      475       
Crater                                       0.91      0.81      0.86      103       
CricketGround                                0.90      0.70      0.79      37        
CricketLeague                                0.00      0.00      0.00      1         
CricketTeam                                  0.91      0.83      0.87      113       
Cricketer                                    0.97      0.97      0.97      2926      
Criminal                                     0.66      0.40      0.50      380       
Crustacean                                   0.87      0.83      0.85      433       
CultivatedVariety                            0.97      0.91      0.94      242       
Curler                                       0.90      0.93      0.91      137       
Currency                                     0.92      0.77      0.84      57        
Cycad                                        0.83      0.54      0.65      28        
CyclingRace                                  0.85      0.94      0.89      115       
CyclingTeam                                  0.97      0.74      0.84      47        
Cyclist                                      0.94      0.96      0.95      1745      
Dam                                          0.91      0.86      0.89      516       
DartsPlayer                                  0.97      0.86      0.91      101       
Device                                       0.91      0.83      0.87      196       
Diocese                                      0.98      0.97      0.97      526       
Disease                                      0.95      0.82      0.88      882       
Drug                                         0.88      0.79      0.84      936       
Earthquake                                   0.99      0.98      0.98      129       
Economist                                    0.46      0.20      0.28      188       
EducationalInstitution                       0.84      0.41      0.55      75        
Election                                     1.00      0.71      0.83      17        
Embryology                                   0.39      0.30      0.34      30        
Engineer                                     0.46      0.21      0.29      115       
Entomologist                                 0.56      0.26      0.36      73        
Enzyme                                       0.96      0.96      0.96      786       
EthnicGroup                                  0.87      0.79      0.83      773       
Eukaryote                                    0.62      0.42      0.50      208       
EurovisionSongContestEntry                   0.84      0.86      0.85      179       
Event                                        0.75      0.69      0.72      655       
Fashion                                      0.33      0.02      0.04      50        
FashionDesigner                              0.66      0.35      0.46      130       
Fern                                         0.82      0.80      0.81      165       
FictionalCharacter                           0.77      0.46      0.57      640       
FieldHockeyLeague                            0.00      0.00      0.00      5         
FigureSkater                                 0.97      0.94      0.95      528       
Film                                         0.92      0.93      0.93      16836     
FilmFestival                                 0.91      0.86      0.88      143       
Fish                                         0.95      0.93      0.94      2870      
FloweringPlant                               0.00      0.00      0.00      26        
Food                                         0.89      0.85      0.87      744       
FootballLeagueSeason                         0.93      0.96      0.95      1501      
FootballMatch                                0.96      0.89      0.92      486       
FormulaOneRacer                              0.69      0.47      0.56      120       
FormulaOneTeam                               0.92      0.58      0.71      19        
Fungus                                       0.94      0.96      0.95      1742      
GaelicGamesPlayer                            0.95      0.96      0.96      568       
Galaxy                                       0.94      0.95      0.95      157       
Game                                         0.82      0.67      0.74      228       
Garden                                       0.81      0.42      0.55      50        
Ginkgo                                       0.00      0.00      0.00      2         
GivenName                                    0.90      0.90      0.90      621       
Glacier                                      0.92      0.83      0.87      119       
Gnetophytes                                  0.00      0.00      0.00      3         
GolfCourse                                   0.86      0.74      0.80      50        
GolfLeague                                   0.00      0.00      0.00      3         
GolfPlayer                                   0.97      0.94      0.96      538       
GolfTournament                               0.93      0.97      0.95      269       
GovernmentAgency                             0.72      0.78      0.75      879       
Governor                                     0.43      0.18      0.26      420       
GrandPrix                                    0.97      0.98      0.98      229       
Grape                                        0.93      0.83      0.88      69        
GreenAlga                                    0.48      0.75      0.58      67        
GridironFootballPlayer                       0.69      0.45      0.54      1090      
Guitarist                                    0.00      0.00      0.00      23        
Gymnast                                      0.91      0.80      0.85      333       
HandballLeague                               0.00      0.00      0.00      3         
HandballPlayer                               0.94      0.96      0.95      366       
HandballTeam                                 0.90      0.78      0.84      60        
Historian                                    0.40      0.03      0.06      124       
HistoricBuilding                             0.79      0.81      0.80      1373      
HistoricPlace                                0.66      0.42      0.51      3697      
HockeyTeam                                   0.93      0.90      0.92      374       
Holiday                                      0.75      0.59      0.66      152       
HollywoodCartoon                             0.82      0.83      0.83      241       
HorseRace                                    0.97      0.98      0.98      378       
HorseRider                                   0.76      0.69      0.72      87        
HorseTrainer                                 0.83      0.39      0.54      38        
Hospital                                     0.84      0.90      0.87      474       
Hotel                                        0.68      0.49      0.57      196       
IceHockeyLeague                              0.77      0.82      0.79      44        
IceHockeyPlayer                              0.95      0.97      0.96      2344      
InformationAppliance                         0.80      0.45      0.57      200       
InlineHockeyLeague                           0.00      0.00      0.00      1         
Insect                                       0.98      0.99      0.98      21033     
Island                                       0.76      0.81      0.78      929       
Jockey                                       0.73      0.74      0.73      91        
Journalist                                   0.30      0.12      0.17      230       
Judge                                        0.52      0.30      0.38      430       
LacrosseLeague                               1.00      0.50      0.67      8         
LacrossePlayer                               0.77      0.84      0.81      69        
Lake                                         0.91      0.90      0.91      1516      
Language                                     0.97      0.94      0.95      1191      
LaunchPad                                    1.00      0.86      0.92      14        
LawFirm                                      0.87      0.49      0.62      68        
Legislature                                  0.79      0.85      0.82      287       
Library                                      0.65      0.82      0.73      158       
Ligament                                     0.96      0.79      0.87      29        
Lighthouse                                   0.95      0.87      0.91      287       
Locomotive                                   0.87      0.95      0.91      515       
Lymph                                        0.83      1.00      0.91      10        
Magazine                                     0.90      0.67      0.77      809       
Mammal                                       0.89      0.89      0.89      1260      
Manga                                        0.73      0.55      0.63      491       
MartialArtist                                0.92      0.70      0.79      513       
Mayor                                        0.42      0.41      0.41      266       
Medician                                     0.17      0.01      0.02      76        
MemberOfParliament                           0.52      0.61      0.56      1191      
MilitaryConflict                             0.92      0.92      0.92      2103      
MilitaryPerson                               0.84      0.79      0.81      4300      
MilitaryStructure                            0.64      0.52      0.57      665       
MilitaryUnit                                 0.94      0.93      0.94      2694      
Mineral                                      0.94      0.93      0.93      224       
MixedMartialArtsEvent                        0.95      0.91      0.93      116       
Model                                        0.62      0.46      0.53      252       
Mollusca                                     0.98      0.98      0.98      4271      
Monarch                                      0.49      0.36      0.42      367       
Monument                                     0.56      0.22      0.32      86        
Moss                                         0.85      0.64      0.73      78        
Motorcycle                                   0.84      0.86      0.85      153       
MotorcycleRacingLeague                       0.00      0.00      0.00      4         
MotorcycleRider                              0.86      0.93      0.89      201       
MotorsportSeason                             0.94      0.93      0.94      503       
Mountain                                     0.91      0.88      0.89      2631      
MountainPass                                 0.93      0.88      0.90      160       
MountainRange                                0.91      0.78      0.84      385       
Murderer                                     0.00      0.00      0.00      17        
Muscle                                       0.92      0.77      0.84      44        
Museum                                       0.83      0.68      0.75      874       
MusicFestival                                0.73      0.34      0.47      64        
MusicGenre                                   0.83      0.49      0.62      172       
Musical                                      0.76      0.83      0.79      209       
MusicalArtist                                0.70      0.73      0.71      7874      
MusicalWork                                  0.50      0.14      0.22      43        
MythologicalFigure                           0.73      0.57      0.64      109       
NCAATeamSeason                               1.00      1.00      1.00      2668      
NascarDriver                                 0.64      0.84      0.72      146       
NationalCollegiateAthleticAssociationAthlete 0.00      0.00      0.00      23        
NationalFootballLeagueSeason                 0.96      0.97      0.96      560       
Nerve                                        0.87      0.86      0.86      56        
NetballPlayer                                0.91      0.69      0.78      42        
Newspaper                                    0.85      0.87      0.86      1019      
Noble                                        0.60      0.48      0.53      799       
Non-ProfitOrganisation                       0.30      0.01      0.02      303       
Novel                                        0.00      0.00      0.00      4         
OfficeHolder                                 0.61      0.70      0.65      10451     
OlympicEvent                                 0.98      0.97      0.98      726       
OlympicResult                                0.80      0.99      0.89      113       
Olympics                                     0.75      1.00      0.86      6         
Organisation                                 0.67      0.60      0.63      2759      
Painter                                      0.68      0.35      0.46      402       
Park                                         0.64      0.48      0.55      608       
Person                                       0.59      0.67      0.63      28250     
Philosopher                                  0.41      0.17      0.24      286       
Photographer                                 0.75      0.04      0.08      73        
Place                                        0.68      0.68      0.68      1071      
Planet                                       0.98      0.95      0.96      537       
Plant                                        0.97      0.96      0.96      8292      
Play                                         0.75      0.62      0.68      288       
PlayboyPlaymate                              0.82      0.73      0.78      45        
Poem                                         0.57      0.53      0.55      57        
Poet                                         0.34      0.21      0.26      56        
PokerPlayer                                  0.96      0.94      0.95      130       
PoliticalParty                               0.93      0.84      0.88      1178      
Politician                                   0.75      0.25      0.38      2808      
PoloLeague                                   0.00      0.00      0.00      5         
Pope                                         0.94      0.88      0.91      68        
PowerStation                                 0.83      0.95      0.89      326       
Presenter                                    0.00      0.00      0.00      16        
President                                    0.54      0.14      0.22      321       
PrimeMinister                                0.33      0.06      0.11      203       
Prison                                       0.89      0.89      0.89      199       
ProgrammingLanguage                          0.73      0.55      0.63      148       
ProtectedArea                                0.80      0.85      0.82      1404      
Protein                                      0.81      0.75      0.78      356       
PublicTransitSystem                          0.49      0.68      0.57      252       
Publisher                                    0.81      0.43      0.56      250       
RaceHorse                                    0.97      0.97      0.97      627       
Racecourse                                   0.81      0.57      0.67      51        
RacingDriver                                 0.74      0.73      0.74      388       
RadioHost                                    0.17      0.32      0.22      56        
RadioProgram                                 0.71      0.62      0.66      189       
RadioStation                                 0.96      0.97      0.96      3086      
RailwayLine                                  0.90      0.77      0.83      526       
RailwayStation                               0.93      0.60      0.73      235       
RailwayTunnel                                0.75      0.75      0.75      32        
RecordLabel                                  0.88      0.85      0.86      506       
Religious                                    0.43      0.14      0.21      169       
ReligiousBuilding                            0.59      0.67      0.62      675       
Reptile                                      0.90      0.90      0.90      742       
ResearchProject                              0.00      0.00      0.00      2         
Restaurant                                   0.79      0.63      0.70      175       
River                                        0.96      0.98      0.97      4331      
Road                                         0.95      0.98      0.96      3216      
RoadJunction                                 0.86      0.64      0.73      28        
RoadTunnel                                   0.65      0.84      0.73      43        
Rocket                                       0.78      0.45      0.57      40        
RollerCoaster                                0.87      0.86      0.87      109       
Rower                                        0.75      0.28      0.41      43        
Royalty                                      0.71      0.67      0.69      1580      
RugbyClub                                    0.85      0.89      0.87      366       
RugbyLeague                                  0.77      0.65      0.70      62        
RugbyPlayer                                  0.95      0.97      0.96      2539      
Saint                                        0.75      0.72      0.73      602       
School                                       0.91      0.96      0.93      4936      
Scientist                                    0.51      0.53      0.52      3674      
ScreenWriter                                 0.36      0.03      0.06      118       
Sea                                          0.50      0.14      0.22      7         
Senator                                      0.21      0.16      0.18      115       
Settlement                                   0.88      0.91      0.90      37384     
Ship                                         0.96      0.97      0.97      4584      
ShoppingMall                                 0.86      0.89      0.88      381       
Single                                       0.88      0.93      0.90      7687      
SiteOfSpecialScientificInterest              0.79      0.88      0.83      170       
Skater                                       0.65      0.80      0.72      80        
SkiArea                                      0.85      0.70      0.77      92        
Skier                                        0.77      0.84      0.81      413       
Skyscraper                                   0.00      0.00      0.00      1         
SnookerChamp                                 0.00      0.00      0.00      4         
SnookerPlayer                                0.83      0.95      0.89      61        
SoapCharacter                                0.85      0.84      0.85      418       
SoccerClub                                   0.97      0.94      0.95      3233      
SoccerClubSeason                             0.94      0.99      0.96      1460      
SoccerLeague                                 0.81      0.77      0.79      269       
SoccerManager                                0.82      0.64      0.72      2965      
SoccerPlayer                                 0.93      0.97      0.95      17824     
SoccerTournament                             0.84      0.92      0.88      1079      
SoftballLeague                               0.00      0.00      0.00      2         
Software                                     0.78      0.84      0.81      1716      
SolarEclipse                                 0.98      0.98      0.98      61        
Song                                         0.56      0.29      0.38      941       
SpaceShuttle                                 0.00      0.00      0.00      2         
SpaceStation                                 0.00      0.00      0.00      9         
Species                                      0.79      0.81      0.80      940       
SpeedwayLeague                               0.00      0.00      0.00      3         
SpeedwayRider                                0.95      0.97      0.96      108       
SpeedwayTeam                                 1.00      0.67      0.80      9         
Sport                                        0.38      0.25      0.30      40        
SportsEvent                                  0.84      0.80      0.82      172       
SportsLeague                                 0.31      0.28      0.29      65        
SportsTeam                                   0.63      0.69      0.66      377       
SportsTeamMember                             0.93      0.59      0.72      90        
SquashPlayer                                 0.97      0.94      0.95      65        
Stadium                                      0.62      0.30      0.40      932       
Star                                         0.96      0.96      0.96      469       
Station                                      0.95      0.99      0.97      4268      
Stream                                       0.00      0.00      0.00      7         
SumoWrestler                                 0.97      0.87      0.92      76        
SupremeCourtOfTheUnitedStatesCase            0.97      0.97      0.97      437       
Surname                                      0.14      0.01      0.02      74        
Swimmer                                      0.88      0.82      0.85      919       
TableTennisPlayer                            0.82      0.68      0.74      75        
TelevisionEpisode                            0.93      0.92      0.92      1277      
TelevisionHost                               0.00      0.00      0.00      10        
TelevisionSeason                             0.94      0.88      0.91      569       
TelevisionShow                               0.86      0.83      0.84      5644      
TelevisionStation                            0.91      0.84      0.87      1163      
TennisLeague                                 0.00      0.00      0.00      1         
TennisPlayer                                 0.92      0.97      0.94      810       
TennisTournament                             0.92      0.94      0.93      188       
Theatre                                      0.30      0.24      0.27      115       
Town                                         0.83      0.71      0.76      6528      
TradeUnion                                   0.89      0.73      0.81      248       
Train                                        0.91      0.76      0.83      237       
Tunnel                                       0.00      0.00      0.00      15        
University                                   0.88      0.86      0.87      2900      
Valley                                       0.14      0.07      0.09      15        
Vein                                         0.92      0.94      0.93      35        
Venue                                        0.41      0.67      0.51      889       
VideoGame                                    0.90      0.93      0.92      3038      
Village                                      0.96      0.95      0.95      26308     
VoiceActor                                   0.00      0.00      0.00      28        
Volcano                                      0.68      0.51      0.58      106       
VolleyballCoach                              0.08      0.12      0.10      8         
VolleyballLeague                             0.71      0.38      0.50      13        
VolleyballPlayer                             0.94      0.89      0.91      601       
WaterRide                                    0.00      0.00      0.00      10        
WaterwayTunnel                               1.00      0.50      0.67      2         
Weapon                                       0.90      0.78      0.84      814       
Website                                      0.73      0.29      0.42      548       
WineRegion                                   0.94      0.77      0.85      65        
Winery                                       0.82      0.73      0.77      51        
WomensTennisAssociationTournament            0.91      0.93      0.92      88        
WorldHeritageSite                            0.57      0.07      0.13      107       
Wrestler                                     0.89      0.73      0.80      524       
WrestlingEvent                               0.95      0.91      0.93      173       
Writer                                       0.50      0.33      0.40      4675      
WrittenWork                                  0.66      0.81      0.73      235       
Year                                         0.99      0.97      0.98      247       
YearInSpaceflight                            1.00      0.86      0.92      7         
owl#Thing                                    0.52      0.56      0.54      42119     

micro avg                                    0.81      0.81      0.81      500412    
macro avg                                    0.69      0.61      0.64      500412    
weighted avg                                 0.81      0.81      0.81      500412    

語の定義を自動的に抽出する

Wikipediaにおけるエンティティとは、「ジョン・フォン・ノイマン」のようなタイトルを持つ、各々のページのことです。今回は、各々のエンティティを辞書的に定義する文を抽出します。

定義文の例

英語における定義文は、概ね以下の形式に従います。

X is a Y that Z.

例えば以下です。

Anarchism is an anti-authoritarian political philosophy that advocates self-governed societies based on voluntary, cooperative institutions and the rejection of hierarchies those societies view as unjust. These institutions are often described as stateless societies, although several authors have defined them more specifically as distinct institutions based on non-hierarchical or free associations. Anarchism holds the state to be undesirable, unnecessary, and harmful.

これらの文は、各Wikipediaページの最初の文として書かれているので、それを利用して抽出します。

抽出スクリプト

まず、enwikiのダンプを取得し、wikiextractorで展開します。

wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -O dump.bz2
tar xjf dump.bz2
git clone https://github.com/attardi/wikiextractor
cd wikiextractor
python WikiExtractor.py -o extracted ../dump.bz2
cd ..

次に、以下のpythonスクリプトを実行します。

# coding: utf-8
import os
import tqdm
from bs4 import BeautifulSoup

if __name__ == "__main__":
    filepathes = []
    for root, dirs, files in os.walk("./wikiextractor/extracted/"):
        if files and "wiki" in files[0]:
            filepathes += [os.path.join(root, file) for file in files]

    for path in tqdm.tqdm(filepathes):
        with open(path) as f:
            soup = BeautifulSoup(f.read(), "lxml")
        docs = soup.find_all("doc")
        result = [
            doc['title'] + "\t" + doc.get_text().split("\n")[3]
            for doc in docs
        ]
        with open("wiki_definition.txt", "a") as f:
            f.write("\n".join(result))

これで、wiki_definition.txtという定義文一覧が出来上がります。

定義文は何に使えるか

Wikipediaページに対して、何らかのラベルを対応させることができる場合、定義文を特徴量としてラベルを予測する問題として定義できます。

例えば、WikipediaページのDBPedia Instance Typesを予測する問題に置き換えることが可能です。

詳細は以下: https://qiita.com/sugiyamath/items/ef94eb4be5a15b232ef1

また、独自の検索システムを構築しているのであれば、エンティティ名で検索された場合に、そのエンティティの定義文を検索結果に表示させることが可能です。

キャプチャ.GIF

Wikipediaのタイプ予測

DBPedia Instance Typesは、DBPediaのオントロジー情報によって各記事を分類したものです。今回は、Wikipediaの記事をこのDBPedia Instance Typesで分類してみます。

特徴量

  1. Wikipedia記事の定義文の単語の平均ベクトル。
  2. Wikipedia記事のカテゴリー。

実行の流れ

  1. DBPediaのType情報を持っているWikipedia記事を取得し、タイプと記事名を対応付けて保存。
  2. 記事名に対する定義文を抽出。
  3. 記事名に対するカテゴリーを抽出。
  4. 事前訓練済みword2vecを使って定義文の平均ベクトルを生成。
  5. 記事のもつ全カテゴリー名をスペース結合してからBoWする。
  6. 平均ベクトルとBoWを水平結合。
  7. 水平結合したスパース行列をSparseRandomProjectionで次元削減。
  8. タイプ情報をOneHot化してラベルとして使う。
  9. 次元削減された特徴量とラベルを使ってKerasモデルを訓練。
  10. 精度を出す。
  11. 最終的に、このモデルを使って未知のWikipedia記事を、定義文とカテゴリーからInstance Typesで分類できる。

訓練部分のコード

import pickle
import numpy as np
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Input, Dropout
from keras.models import Model
from scipy.sparse import coo_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.random_projection import SparseRandomProjection
from tqdm import tqdm


def build_model(num_labels, inputlen):
    inp = Input(shape=(inputlen, ))
    tmp = Dense(1024, activation="relu", kernel_initializer="he_normal")(inp)
    tmp = Dropout(0.5)(tmp)
    tmp = Dense(1042, activation="relu", kernel_initializer="he_normal")(tmp)
    tmp = Dropout(0.5)(tmp)
    out = Dense(units=num_labels, activation='softmax')(tmp)
    model = Model(inp, out)
    opt = optimizers.Adam(lr=0.001, clipvalue=0.5)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    return model


def gen_data(data, page2cat, kv, labels):
    X = []
    X_cat = []
    y = []
    for d in tqdm(data):
        tmp = []
        tmp_cat = ""
        for cat in page2cat[d["entity"]]:
            tmp_cat += cat.lower() + " "
        tmp_cat = tmp_cat.replace("_", " ")

        for word in d["definition"].replace(".", " ").replace(",",
                                                              " ").split():
            try:
                x = kv.wv[word]
            except Exception:
                continue
            tmp.append(x)
        if tmp:
            X.append(np.mean(tmp, axis=0))
            X_cat.append(tmp_cat)
            y.append(int(labels[d["type"]]))
    vect = CountVectorizer().fit(X_cat)
    X_cat = vect.transform(X_cat)
    X = coo_matrix(X)
    X = hstack([X, X_cat])
    return X, y, vect


def y_fix(y, ylen):
    out = []
    for target in y:
        tmp = np.zeros(ylen)
        tmp[int(target)] = 1.0
        out.append(tmp)
    return out


def generate_data(X, y, batch_size=1000):
    y = np.array(y)
    print(X.shape)
    print(y.shape)
    while True:
        for i in range(int(X.shape[0]/1000)):
            yield X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size]
       


if __name__ == "__main__":
    #print("data loading...")
    #with open("./wiki_definition.json") as f:
    #data = json.load(f)

    #with open("../scripts/labels.json") as f:
    #    tmp = json.load(f)
    #    labels = tmp[1]

    #data = shuffle(data)
    #kv = KeyedVectors.load("./enwiki_model/word2vec.model", mmap="r")
    #with open("./categories.pkl", "rb") as f:
    #    page2cat = pickle.load(f)
    #print("done loading")

    #print("generating data...")
    #X, y, vect = gen_data(data, page2cat, kv, labels)
    #print("done generating")

    #print("saveing data...")
    #with open("data.pkl", "wb") as f:
    #    pickle.dump((X, y, vect), f, protocol=4)
    #print("done saving")

    print("loading")
    with open("data.pkl", "rb") as f:
        X, y, _ = pickle.load(f)
    print("done loading")

    y_len = np.unique(y).shape[0] + 1
    print(y_len)
    y = y_fix(y, y_len)

    print("projection")
    projection = SparseRandomProjection(n_components=1024).fit(X)
    X = projection.transform(X)
    with open("proj.pkl", "wb") as f:
        pickle.dump(projection, f)
    print(X.shape)
    X = X.toarray()
    print("done projection")

    print("training...")
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test)
    model = build_model(y_len, X_train.shape[1])
    print("done model building")
    callbacks = [
        ModelCheckpoint("model.h5",
                        save_best_only=False,
                        monitor="val_loss",
                        mode="min"),
        ModelCheckpoint("model_best.h5",
                        save_best_only=True,
                        monitor="val_loss",
                        mode="min")
    ]

    model.fit_generator(generate_data(X_train, y_train),
                        validation_data=generate_data(X_val, y_val),
                        steps_per_epoch=1000,
                        validation_steps=5,
                        epochs=10,
                        callbacks=callbacks)
    print("done training")

    print("evaluating...")
    y_pred = [np.argmax(x) for x in model.predict(X_test)]
    y_test = [np.argmax(x) for x in y_test]
    with open("eval.txt", "w") as f:
        f.write(classification_report(y_test, y_pred))
    print("done eval")

精度

                                             precision recall    f1-score  support   

AcademicConference                           0.67      0.86      0.75      14        
AcademicJournal                              0.98      0.97      0.97      1252      
Actor                                        0.49      0.49      0.49      810       
AdministrativeRegion                         0.89      0.90      0.90      3979      
AdultActor                                   0.86      0.67      0.75      161       
Aircraft                                     0.99      0.99      0.99      1932      
Airline                                      0.99      0.99      0.99      696       
Airport                                      0.98      0.96      0.97      2509      
Album                                        0.99      1.00      1.00      22773     
AmateurBoxer                                 0.36      0.72      0.48      72        
Ambassador                                   0.46      0.35      0.40      130       
AmericanFootballLeague                       0.64      0.41      0.50      17        
AmericanFootballPlayer                       0.80      0.98      0.88      3584      
AmericanFootballTeam                         0.50      1.00      0.67      5         
Amphibian                                    0.94      0.93      0.94      636       
AmusementParkAttraction                      0.74      0.91      0.82      90        
AnatomicalStructure                          0.60      0.88      0.71      309       
Animal                                       0.79      0.77      0.78      1240      
AnimangaCharacter                            0.89      0.91      0.90      34        
Anime                                        0.63      0.81      0.71      193       
Arachnid                                     0.81      0.93      0.87      745       
Archaea                                      0.80      0.86      0.83      42        
Architect                                    0.52      0.73      0.61      520       
ArchitecturalStructure                       0.47      0.56      0.51      52        
Artery                                       0.93      0.94      0.93      53        
ArtificialSatellite                          0.94      0.99      0.96      438       
Artist                                       0.53      0.52      0.53      2765      
ArtistDiscography                            0.98      0.98      0.98      721       
Artwork                                      0.92      0.94      0.93      911       
Asteroid                                     1.00      0.14      0.25      7         
Astronaut                                    0.82      0.92      0.87      115       
Athlete                                      0.91      0.88      0.90      6782      
AustralianFootballTeam                       0.59      0.93      0.72      70        
AustralianRulesFootballPlayer                0.99      1.00      0.99      2280      
AutoRacingLeague                             0.00      0.00      0.00      2         
Automobile                                   0.95      0.97      0.96      999       
AutomobileEngine                             0.90      0.96      0.93      68        
Award                                        0.89      0.96      0.92      946       
Bacteria                                     0.74      0.15      0.25      187       
BadmintonPlayer                              0.95      0.99      0.97      267       
Band                                         0.86      0.97      0.91      6034      
Bank                                         0.65      0.74      0.69      516       
Baronet                                      0.63      0.88      0.73      130       
BaseballLeague                               0.90      0.86      0.88      51        
BaseballPlayer                               0.98      0.99      0.99      4023      
BaseballSeason                               0.91      0.70      0.79      43        
BaseballTeam                                 0.38      1.00      0.55      3         
BasketballLeague                             0.74      0.89      0.81      98        
BasketballPlayer                             0.95      0.97      0.96      2193      
BasketballTeam                               0.86      0.95      0.90      278       
Bay                                          0.00      0.00      0.00      1         
BeachVolleyballPlayer                        0.83      0.13      0.22      39        
BeautyQueen                                  0.78      0.97      0.87      392       
Beverage                                     0.79      0.83      0.81      179       
BiologicalDatabase                           0.90      0.97      0.94      67        
Bird                                         0.97      0.99      0.98      2455      
BodyOfWater                                  0.66      0.54      0.60      228       
Bodybuilder                                  0.82      0.75      0.78      53        
Bone                                         0.87      0.55      0.67      62        
Book                                         0.96      0.97      0.97      6533      
Boxer                                        0.90      0.91      0.90      733       
Brain                                        0.95      0.70      0.80      99        
Brewery                                      0.25      0.09      0.13      45        
Bridge                                       0.91      0.94      0.92      751       
BroadcastNetwork                             0.65      0.24      0.35      218       
Building                                     0.76      0.80      0.78      8573      
BusCompany                                   0.75      0.90      0.82      256       
BusinessPerson                               0.38      0.05      0.08      130       
CanadianFootballTeam                         0.65      0.70      0.68      37        
Canal                                        0.80      0.94      0.87      79        
Canoeist                                     0.86      0.93      0.89      72        
Cardinal                                     0.47      0.78      0.59      137       
Castle                                       0.61      0.79      0.69      265       
Cave                                         0.66      0.89      0.76      91        
Chancellor                                   0.00      0.00      0.00      19        
Cheese                                       0.95      0.72      0.82      57        
Chef                                         0.71      0.74      0.72      118       
ChemicalCompound                             0.84      0.86      0.85      1869      
ChessPlayer                                  0.81      0.96      0.88      277       
ChristianBishop                              0.77      0.87      0.82      1655      
City                                         0.79      0.83      0.81      3829      
ClassicalMusicArtist                         0.31      0.21      0.25      52        
ClassicalMusicComposition                    0.52      0.74      0.61      122       
Cleric                                       0.76      0.51      0.61      470       
ClubMoss                                     0.00      0.00      0.00      15        
College                                      0.86      0.92      0.89      13        
CollegeCoach                                 0.91      0.91      0.91      1286      
Colour                                       0.92      0.87      0.89      38        
Comedian                                     0.52      0.34      0.41      246       
ComedyGroup                                  0.44      0.33      0.38      12        
Comic                                        0.81      0.94      0.87      374       
ComicStrip                                   0.85      0.81      0.83      62        
ComicsCharacter                              0.93      0.95      0.94      710       
ComicsCreator                                0.65      0.83      0.73      531       
Company                                      0.89      0.88      0.89      9169      
ConcentrationCamp                            0.55      0.73      0.63      15        
Congressman                                  0.47      0.63      0.54      597       
Conifer                                      0.84      0.61      0.71      128       
Constellation                                1.00      0.93      0.97      15        
Continent                                    0.00      0.00      0.00      4         
Convention                                   0.68      0.70      0.69      387       
Country                                      0.78      0.73      0.75      608       
Crater                                       0.82      0.98      0.89      121       
CricketGround                                0.82      0.97      0.89      38        
CricketTeam                                  0.76      0.98      0.86      109       
Cricketer                                    0.98      0.99      0.99      3426      
Criminal                                     0.64      0.67      0.65      424       
Crustacean                                   0.92      0.80      0.86      485       
CultivatedVariety                            0.99      0.93      0.96      307       
Curler                                       0.90      0.99      0.94      150       
CurlingLeague                                0.00      0.00      0.00      2         
Currency                                     0.90      0.91      0.91      69        
Cycad                                        1.00      0.20      0.34      44        
CyclingRace                                  0.87      0.99      0.93      147       
CyclingTeam                                  0.97      0.98      0.98      62        
Cyclist                                      0.96      0.99      0.97      2097      
Dam                                          0.94      0.91      0.93      602       
DartsPlayer                                  0.98      0.93      0.96      105       
Device                                       0.91      0.98      0.94      218       
Diocese                                      0.95      1.00      0.97      601       
Disease                                      0.86      0.95      0.90      1111      
Drug                                         0.82      0.89      0.85      1143      
Earthquake                                   0.97      0.99      0.98      137       
Economist                                    0.49      0.70      0.57      240       
EducationalInstitution                       0.74      0.65      0.70      84        
Election                                     0.68      0.68      0.68      25        
Embryology                                   0.70      0.17      0.28      40        
Engineer                                     0.44      0.53      0.48      137       
Entomologist                                 0.49      0.63      0.55      76        
Enzyme                                       0.97      0.97      0.97      939       
EthnicGroup                                  0.81      0.95      0.88      885       
Eukaryote                                    0.46      0.60      0.52      218       
EurovisionSongContestEntry                   0.83      0.96      0.89      193       
Event                                        0.89      0.55      0.68      820       
Fashion                                      0.54      0.45      0.49      71        
FashionDesigner                              0.50      0.72      0.59      125       
Fern                                         0.85      0.83      0.84      179       
FictionalCharacter                           0.84      0.88      0.86      772       
FieldHockeyLeague                            0.50      0.67      0.57      3         
FigureSkater                                 0.96      0.99      0.97      549       
Film                                         0.96      0.98      0.97      19838     
FilmFestival                                 0.86      0.97      0.91      169       
Fish                                         0.94      0.97      0.95      3295      
FloweringPlant                               0.00      0.00      0.00      23        
Food                                         0.87      0.96      0.92      850       
FootballLeagueSeason                         0.91      0.98      0.94      1674      
FootballMatch                                0.93      0.91      0.92      620       
FormulaOneRacer                              0.91      0.79      0.85      146       
FormulaOneTeam                               0.94      0.94      0.94      32        
Fungus                                       0.96      0.98      0.97      2043      
GaelicGamesPlayer                            0.98      0.99      0.99      670       
Galaxy                                       0.96      1.00      0.98      155       
Game                                         0.93      0.88      0.90      254       
Garden                                       0.76      0.92      0.83      61        
Ginkgo                                       0.00      0.00      0.00      1         
GivenName                                    0.93      0.96      0.95      665       
Glacier                                      0.95      0.92      0.94      129       
Gnetophytes                                  0.00      0.00      0.00      5         
GolfCourse                                   0.89      0.98      0.93      63        
GolfLeague                                   0.50      0.33      0.40      3         
GolfPlayer                                   0.97      1.00      0.98      609       
GolfTournament                               0.99      0.99      0.99      336       
GovernmentAgency                             0.73      0.87      0.79      979       
Governor                                     0.67      0.42      0.52      518       
GrandPrix                                    0.99      0.99      0.99      289       
Grape                                        0.94      1.00      0.97      58        
GreenAlga                                    0.57      0.70      0.63      70        
GridironFootballPlayer                       0.91      0.36      0.52      1166      
Guitarist                                    0.00      0.00      0.00      25        
Gymnast                                      0.94      0.90      0.92      410       
HandballLeague                               1.00      0.25      0.40      4         
HandballPlayer                               0.93      0.97      0.95      400       
HandballTeam                                 0.79      0.97      0.87      72        
Historian                                    0.35      0.21      0.26      115       
HistoricBuilding                             0.80      0.92      0.86      1667      
HistoricPlace                                0.71      0.64      0.67      4491      
HockeyTeam                                   0.94      0.99      0.96      454       
Holiday                                      0.88      0.82      0.85      188       
HollywoodCartoon                             0.91      0.96      0.93      293       
HorseRace                                    0.98      0.99      0.99      447       
HorseRider                                   0.76      0.90      0.82      113       
HorseTrainer                                 0.64      0.62      0.63      37        
Hospital                                     0.92      0.94      0.93      534       
Hotel                                        0.64      0.80      0.71      234       
IceHockeyLeague                              0.83      0.85      0.84      53        
IceHockeyPlayer                              0.98      1.00      0.99      2787      
InformationAppliance                         0.89      0.72      0.80      218       
InlineHockeyLeague                           0.00      0.00      0.00      1         
Insect                                       0.99      0.99      0.99      24521     
Island                                       0.82      0.92      0.86      1136      
Jockey                                       0.77      0.86      0.81      102       
Journalist                                   0.46      0.28      0.35      280       
Judge                                        0.53      0.53      0.53      548       
LacrosseLeague                               1.00      0.78      0.88      9         
LacrossePlayer                               0.94      0.97      0.95      62        
Lake                                         0.91      0.95      0.93      1779      
Language                                     0.96      0.97      0.97      1487      
LaunchPad                                    0.76      0.90      0.83      21        
LawFirm                                      0.91      0.79      0.85      66        
Legislature                                  0.82      0.92      0.87      332       
Library                                      0.82      0.76      0.79      197       
Ligament                                     0.82      0.72      0.77      32        
Lighthouse                                   0.96      0.98      0.97      341       
Locomotive                                   0.99      0.98      0.99      577       
Lymph                                        0.93      0.87      0.90      15        
Magazine                                     0.86      0.93      0.90      871       
Mammal                                       0.93      0.95      0.94      1499      
Manga                                        0.96      0.87      0.92      570       
MartialArtist                                0.92      0.92      0.92      604       
Mayor                                        0.46      0.47      0.47      307       
Medician                                     0.40      0.07      0.12      88        
MemberOfParliament                           0.68      0.79      0.73      1385      
MilitaryConflict                             0.95      0.97      0.96      2443      
MilitaryPerson                               0.80      0.90      0.85      5016      
MilitaryStructure                            0.67      0.73      0.70      803       
MilitaryUnit                                 0.95      0.96      0.96      3174      
Mineral                                      0.98      0.92      0.95      266       
MixedMartialArtsEvent                        0.95      0.99      0.97      136       
Model                                        0.74      0.60      0.66      309       
Mollusca                                     0.99      0.99      0.99      4859      
Monarch                                      0.51      0.62      0.56      409       
Monument                                     0.44      0.54      0.49      107       
Moss                                         0.89      0.18      0.30      89        
Motorcycle                                   0.98      0.98      0.98      212       
MotorcycleRacingLeague                       0.00      0.00      0.00      5         
MotorcycleRider                              0.92      0.98      0.95      211       
MotorsportSeason                             0.96      0.99      0.97      534       
Mountain                                     0.91      0.93      0.92      3025      
MountainPass                                 0.93      0.95      0.94      189       
MountainRange                                0.82      0.87      0.84      461       
Murderer                                     0.00      0.00      0.00      16        
Muscle                                       1.00      0.98      0.99      47        
Museum                                       0.79      0.87      0.83      989       
MusicFestival                                0.48      0.80      0.60      69        
MusicGenre                                   0.67      0.91      0.77      221       
Musical                                      0.91      0.97      0.94      246       
MusicalArtist                                0.70      0.88      0.78      9249      
MusicalWork                                  0.47      0.69      0.56      52        
MythologicalFigure                           0.67      0.94      0.78      151       
NCAATeamSeason                               0.99      1.00      1.00      3100      
NascarDriver                                 0.82      0.95      0.88      166       
NationalCollegiateAthleticAssociationAthlete 0.50      0.03      0.05      36        
NationalFootballLeagueEvent                  0.33      0.50      0.40      2         
NationalFootballLeagueSeason                 0.97      0.99      0.98      635       
Nerve                                        0.93      0.74      0.83      58        
NetballPlayer                                0.95      0.92      0.94      39        
Newspaper                                    0.94      0.92      0.93      1189      
Noble                                        0.59      0.57      0.58      870       
Non-ProfitOrganisation                       0.46      0.02      0.04      328       
Novel                                        0.00      0.00      0.00      6         
OfficeHolder                                 0.65      0.76      0.70      12280     
OlympicEvent                                 0.96      1.00      0.98      809       
OlympicResult                                0.85      0.99      0.92      149       
Olympics                                     0.88      0.78      0.82      9         
Organisation                                 0.65      0.78      0.71      3162      
Painter                                      0.58      0.59      0.58      469       
Park                                         0.76      0.67      0.71      699       
Person                                       0.68      0.68      0.68      33152     
Philosopher                                  0.33      0.63      0.43      361       
Photographer                                 0.36      0.29      0.32      92        
Place                                        0.90      0.65      0.75      1348      
Planet                                       0.98      0.99      0.99      663       
Plant                                        0.96      0.98      0.97      9790      
Play                                         0.83      0.93      0.88      373       
PlayboyPlaymate                              0.88      0.79      0.84      48        
Poem                                         0.59      0.60      0.59      62        
Poet                                         0.32      0.37      0.34      49        
PokerPlayer                                  0.98      0.96      0.97      130       
PoliticalParty                               0.93      0.96      0.95      1343      
Politician                                   0.64      0.49      0.56      3440      
PoloLeague                                   0.75      0.43      0.55      7         
Pope                                         0.86      0.97      0.91      63        
PowerStation                                 0.89      0.95      0.92      386       
Presenter                                    0.00      0.00      0.00      21        
President                                    0.61      0.20      0.30      417       
PrimeMinister                                0.41      0.29      0.34      249       
Prison                                       0.96      0.98      0.97      260       
ProgrammingLanguage                          0.86      0.53      0.66      180       
ProtectedArea                                0.90      0.87      0.88      1738      
Protein                                      0.81      0.84      0.83      416       
PublicTransitSystem                          0.63      0.54      0.58      337       
Publisher                                    0.81      0.67      0.73      263       
RaceHorse                                    1.00      0.99      1.00      742       
Racecourse                                   0.78      0.84      0.81      45        
RacingDriver                                 0.82      0.89      0.86      455       
RadioHost                                    0.26      0.14      0.18      74        
RadioProgram                                 0.86      0.91      0.89      234       
RadioStation                                 0.98      0.99      0.99      3641      
RailwayLine                                  0.80      0.91      0.85      645       
RailwayStation                               0.84      0.82      0.83      315       
RailwayTunnel                                0.88      0.68      0.77      41        
RecordLabel                                  0.90      0.96      0.93      576       
Religious                                    0.73      0.56      0.64      199       
ReligiousBuilding                            0.80      0.67      0.73      794       
Reptile                                      0.95      0.91      0.93      878       
ResearchProject                              0.00      0.00      0.00      1         
Restaurant                                   0.84      0.84      0.84      216       
River                                        0.98      0.99      0.99      5047      
Road                                         0.97      0.99      0.98      3725      
RoadJunction                                 0.95      0.61      0.75      31        
RoadTunnel                                   0.67      0.96      0.79      52        
Rocket                                       0.74      0.81      0.78      43        
RollerCoaster                                0.99      0.98      0.99      122       
Rower                                        0.94      0.89      0.91      53        
Royalty                                      0.70      0.84      0.76      1833      
RugbyClub                                    0.92      0.95      0.94      418       
RugbyLeague                                  0.87      0.87      0.87      98        
RugbyPlayer                                  0.98      0.99      0.99      3033      
Saint                                        0.81      0.93      0.86      710       
School                                       0.95      0.97      0.96      5821      
Scientist                                    0.58      0.60      0.59      4383      
ScreenWriter                                 0.55      0.26      0.35      138       
Sea                                          0.43      0.38      0.40      8         
Senator                                      0.33      0.45      0.38      141       
Settlement                                   0.92      0.95      0.93      43480     
Ship                                         0.99      0.99      0.99      5209      
ShoppingMall                                 0.97      0.96      0.96      428       
Single                                       0.96      0.98      0.97      8900      
SiteOfSpecialScientificInterest              0.85      0.97      0.91      191       
Skater                                       0.76      0.93      0.84      99        
SkiArea                                      0.90      0.87      0.89      107       
Skier                                        0.82      0.89      0.85      449       
SnookerChamp                                 0.00      0.00      0.00      5         
SnookerPlayer                                0.74      0.96      0.84      76        
SoapCharacter                                0.96      0.91      0.93      441       
SoccerClub                                   0.97      0.99      0.98      3770      
SoccerClubSeason                             0.97      0.99      0.98      1737      
SoccerLeague                                 0.79      0.90      0.84      318       
SoccerManager                                0.93      0.79      0.86      3417      
SoccerPlayer                                 0.96      0.99      0.98      20789     
SoccerTournament                             0.78      0.94      0.85      1232      
SoftballLeague                               0.67      0.40      0.50      5         
Software                                     0.86      0.90      0.88      1926      
SolarEclipse                                 0.97      1.00      0.98      62        
Song                                         0.77      0.62      0.69      1124      
SpaceShuttle                                 0.00      0.00      0.00      6         
SpaceStation                                 0.75      0.60      0.67      5         
Spacecraft                                   0.00      0.00      0.00      3         
Species                                      0.79      0.83      0.81      1108      
SpeedwayLeague                               0.75      0.75      0.75      4         
SpeedwayRider                                0.98      0.98      0.98      135       
SpeedwayTeam                                 0.81      0.91      0.86      23        
Sport                                        0.85      0.52      0.65      42        
SportsEvent                                  0.80      0.84      0.82      215       
SportsLeague                                 0.54      0.48      0.51      77        
SportsTeam                                   0.79      0.70      0.74      455       
SportsTeamMember                             0.97      0.61      0.75      135       
SquashPlayer                                 0.95      0.93      0.94      81        
Stadium                                      0.63      0.58      0.60      1058      
Star                                         0.99      0.98      0.99      561       
Station                                      0.98      0.98      0.98      5128      
Stream                                       0.00      0.00      0.00      9         
SumoWrestler                                 0.96      0.96      0.96      80        
SupremeCourtOfTheUnitedStatesCase            0.98      1.00      0.99      500       
Surname                                      0.49      0.69      0.57      80        
Swimmer                                      0.86      0.98      0.92      1096      
TableTennisPlayer                            0.83      0.93      0.88      88        
TelevisionEpisode                            0.98      0.98      0.98      1546      
TelevisionHost                               0.00      0.00      0.00      13        
TelevisionSeason                             0.88      0.94      0.91      640       
TelevisionShow                               0.90      0.95      0.93      6608      
TelevisionStation                            0.90      0.98      0.94      1334      
TennisLeague                                 0.00      0.00      0.00      3         
TennisPlayer                                 0.97      0.99      0.98      951       
TennisTournament                             0.95      0.99      0.97      213       
Theatre                                      0.46      0.28      0.35      127       
Town                                         0.88      0.78      0.83      7785      
TradeUnion                                   0.91      0.86      0.89      298       
Train                                        0.90      0.94      0.92      272       
Tunnel                                       0.00      0.00      0.00      24        
University                                   0.92      0.90      0.91      3383      
Valley                                       0.76      0.57      0.65      23        
Vein                                         0.97      0.88      0.93      43        
Venue                                        0.51      0.56      0.53      1085      
VideoGame                                    0.99      0.99      0.99      3492      
VideogamesLeague                             0.00      0.00      0.00      1         
Village                                      0.96      0.97      0.97      30851     
VoiceActor                                   0.36      0.41      0.38      22        
Volcano                                      0.64      0.88      0.74      131       
VolleyballCoach                              0.60      0.21      0.32      14        
VolleyballLeague                             1.00      0.69      0.82      13        
VolleyballPlayer                             0.88      0.99      0.93      676       
WaterRide                                    0.83      0.29      0.43      17        
WaterwayTunnel                               0.57      1.00      0.73      4         
Weapon                                       0.92      0.95      0.93      893       
Website                                      0.63      0.60      0.61      609       
WineRegion                                   0.89      0.85      0.87      67        
Winery                                       0.86      0.90      0.88      63        
WomensTennisAssociationTournament            0.92      0.98      0.95      94        
WorldHeritageSite                            0.45      0.54      0.49      122       
Wrestler                                     0.93      0.89      0.91      618       
WrestlingEvent                               0.96      1.00      0.98      226       
Writer                                       0.49      0.55      0.52      5418      
WrittenWork                                  0.84      0.93      0.89      272       
Year                                         0.96      0.97      0.97      281       
YearInSpaceflight                            1.00      0.92      0.96      12        
owl#Thing                                    0.68      0.51      0.58      49682     

micro avg                                    0.86      0.86      0.86      586369    
macro avg                                    0.75      0.75      0.74      586369    
weighted avg                                 0.86      0.86      0.86      586369    

Word-sense disambiguation: 語義の曖昧性解消問題

曖昧性解消とは、ある単語が複数の意味を持つとき、文脈に応じて適切なエンティティを選択することです。今回は、前回作成したスクリプトの続きとして作成します。

仕組み

無題の図形描画.jpg

  1. node=(entity, mention), edge=(node1, node2)とした共起グラフを構築。
  2. あるメンションに紐づくエンティティリストをn個取得。
  3. そのメンションとエンティティのペアn個(e1,m),(e2,m),...,(en,m)を共起グラフから各々検索。
  4. 各々の検索によって、紐付いているノードを取得し、ノードの件数と、ノードのメンションを取得。
  5. ノードの件数を使ってある値pを計算。
  6. ノードに紐づくメンションの平均ベクトルと、文脈の平均ベクトルのコサイン類似度を値qとする。
  7. ある値alphaに対し、r = alphap+(1-alpha)qを計算。
  8. rを正規化.

実行

前回のスクリプト実行フロー

git clone https://github.com/sugiyamath/entity_types_scripts
cd entity_types_scripts
wget http://downloads.dbpedia.org/2016-10/core/instance_types_en.ttl.bz2
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bunzip2 *.bz2
mv enwiki-latest-pages-articles.xml dump
python build_types.py
python extract_mention.py
python json2marisa.py
python mprob.py "Obama"

以下のスクリプトは、前回作成したスクリプトをすべて実行した上で実行してください。

python mention_and_graph.py
python pkl2marisa.py
python extract_graph.py
python build_graph.py
python create_index.py
pythoon eprob.py "Galileo" "Japan,TV" 0.4

実行例

python eprob.py Galileo Japan,TV 0.4
{'Galileo_(1968_film)': 0.08162450203592601,
 'Galileo_(1975_film)': 0.07264921641693314,
 'Galileo_(1994_film)': 0.06521374088908712,
 'Galileo_(TV_series)': 0.18111023032336127,
 'Galileo_(horse)': 0.09737876204706845,
 'Galileo_(operating_system)': 0.04316242003252071,
 'Galileo_(satellite_navigation)': 0.13500750427325464,
 'Galileo_(song)': 0.03378523117010994,
 'Galileo_(spacecraft)': 0.12453115091750815,
 'Galileo_Galilei': 0.07404413908764415,
 'Intel_Galileo': 0.06936555756415227,
 'Life_of_Galileo': 0.022127545242434238}

python eprob.py Galileo person,history 0.4
{'Galileo_(1968_film)': 0.044064635682177,
 'Galileo_(1975_film)': -0.03600631439106292,
 'Galileo_(1994_film)': 0.15265454663089653,
 'Galileo_(TV_series)': 0.0013680732885760023,
 'Galileo_(horse)': 0.10470165517486565,
 'Galileo_(operating_system)': 0.0005596730220754116,
 'Galileo_(satellite_navigation)': 0.0738529599122509,
 'Galileo_(song)': 0.05449777719438314,
 'Galileo_(spacecraft)': 0.16406225919086243,
 'Galileo_Galilei': 0.37058668171965764,
 'Intel_Galileo': 0.07787230258914513,
 'Life_of_Galileo': -0.00821425001382678}

コードの中身

mention_and_graph.py

(entity, mention) というペアをキーとしてIDを振ります。

# coding: utf-8
import re
from tqdm import tqdm
import pickle


def extract_mention_and_entity(exp):
    tmp = exp[2:-2]
    tmp2 = tmp[0].upper() + tmp[1:]
    if "|" in tmp2:
        entity, mention = tmp2.split("|")
        mention = mention.strip()
    else:
        entity = tmp2[:]
        mention = tmp[:]
    entity = entity.strip()
    entity = entity.replace(" ", "_")
    return entity, mention


if __name__ == "__main__":
    reg = re.compile(r"\[\[.+?\]\]")
    out = {}
    counter = 0
    with open("dump", errors='ignore') as f1:
        for line in tqdm(f1):
            ents = []
            mentions = []
            for x in re.findall(reg, line):
                try:
                    entity, mention = extract_mention_and_entity(x)
                except Exception:
                    continue
                key = (entity, mention)
                if key in out:
                    continue
                out[key] = counter
                counter += 1

    with open("me2id.pkl", "wb") as f2:
        pickle.dump(out, f2)

pkl2marisa.py

me2id.pklをmarisaに変換します。

from marisa_trie import BytesTrie
import pickle
from tqdm import tqdm

if __name__ == "__main__":
    with open("./me2id.pkl", "rb") as f:
        em2id = pickle.load(f)

    trie = BytesTrie(
        [(str(x[0]), bytes(str(x[1]), 'utf-8')) for x in tqdm(em2id.items())])
    trie.save("em2id.marisa")

    trie = BytesTrie(
        [(str(x[1]), bytes(str(x[0]), 'utf-8'))
         for x in tqdm(em2id.items())])
    trie.save("id2em.marisa")

extract_graph.py

共起するメンション,エンティティのペアのグラフをtxt形式で出力します。

# coding: utf-8
import re
from tqdm import tqdm
import pickle
from marisa_trie import BytesTrie


def extract_mention_and_entity(exp):
    tmp = exp[2:-2]
    tmp2 = tmp[0].upper() + tmp[1:]
    if "|" in tmp2:
        entity, mention = tmp2.split("|")
        mention = mention.strip()
    else:
        entity = tmp2[:]
        mention = tmp[:]
    entity = entity.strip()
    entity = entity.replace(" ", "_")
    return entity, mention


if __name__ == "__main__":
    trie = BytesTrie()
    trie.load("./types.marisa")
    with open("me2id.pkl", "rb") as f:
        em2id = pickle.load(f)

    reg = re.compile(r"\[\[.+?\]\]")
    out = {}
    with open("dump", errors='ignore') as f1:
        with open("graph.txt", "w") as f2:
            for line in tqdm(f1):
                ents = []
                mentions = []
                for x in re.findall(reg, line):
                    try:
                        entity, mention = extract_mention_and_entity(x)
                        trie[entity]
                        ents.append(entity)
                        mentions.append(mention)
                    except Exception:
                        continue
                try:
                    assert len(ents) == len(mentions)
                except AssertionError:
                    continue

                pairs = sorted(list(zip(ents, mentions)))
                for i in range(len(pairs) - 1):
                    for j in range(i + 1, len(pairs)):
                        pair = [em2id[tuple(pairs[i])], em2id[tuple(pairs[j])]]
                        f2.write(str(pair[0]) + "\t" + str(pair[1]) + "\n")

build_graph.py

txtからsqliteへ格納します。

from tqdm import tqdm
import sqlite3


def create_table(conn):
    c = conn.cursor()
    sql = """
create table if not exists graph (
    id integer primary key,
    from_id integer NOT NULL,
    to_id integer NOT NULL
);
"""
    c.execute(sql)


def insert_graph(conn, f, t):
    c = conn.cursor()
    sql = "insert into graph(from_id,to_id) values (?,?)"
    c.execute(sql, (f, t))


if __name__ == "__main__":
    debug = False
    conn = sqlite3.connect("db.sqlite3")
    create_table(conn)
    with open("graph.txt") as f:
        for line in tqdm(f):
            line = list(map(int, line.strip().split("\t")))
            insert_graph(conn, line[0], line[1])
            insert_graph(conn, line[1], line[0])
    conn.commit()

create_index.py

インデクスを作成します。

import sqlite3


def create_index(conn):
    c = conn.cursor()
    sql1 = "create index index_from_id_graph on graph(from_id)"
    sql2 = "create index index_to_id_graph on graph(to_id)"
    c.execute(sql1)
    c.execute(sql2)
    conn.commit()


if __name__ == "__main__":
    conn = sqlite3.connect("./db.sqlite3")
    create_index(conn)

comention.py

モジュール内で使われるサブモジュールです。

# coding: utf-8
from marisa_trie import BytesTrie
import sqlite3
import json
from typing import Dict, Tuple, List


def get_comentions(conn: sqlite3.Connection,
                   mention: str,
                   mstat: BytesTrie,
                   em2id: BytesTrie,
                   id2em: BytesTrie) -> Dict[str, List[Tuple[str, str]]]:
    out = {}
    c = conn.cursor()
    sql = "select to_id from graph where from_id=?"
    es = list(json.loads((mstat[mention][0].decode())).keys())
    ids = [int(em2id[str((e, mention))][0]) for e in es]
    for idx, e in zip(ids, es):
        c.execute(sql, (idx, ))
        tmp = [eval(id2em[str(x[0])][0].decode()) for x in c.fetchall()]
        if tmp:
            out[e] = tmp
    return out


if __name__ == "__main__":
    import sys
    from pprint import pprint
    conn = sqlite3.connect("./db.sqlite3")
    trie = BytesTrie().load("./mention_stat.marisa")
    trie2 = BytesTrie().load("./em2id.marisa")
    trie3 = BytesTrie().load("./id2em.marisa")
    pprint(get_comentions(conn, sys.argv[1], trie, trie2, trie3))

eprob.py

モジュール本体です。

#coding: utf-8
import sqlite3
import numpy as np
from marisa_trie import BytesTrie
from comention import get_comentions


def calc_prob1(coms):
    out = {}
    total = 0
    for k, v in coms.items():
        val = len(v)
        out[k] = val
        total += val
    return {k: float(v) / float(total) for k, v in out.items()}


def cossim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def softmax_dict(d):
    ks, vs = [], []
    for k, v in d.items():
        ks.append(k)
        vs.append(v)
    vs = np.array(vs)
    m = np.max(vs)
    ex = np.exp(vs - m)
    result = ex / ex.sum(axis=0)
    return dict(zip(ks, result.tolist()))


def calc_prob2(model, coms, contexts):
    out = {}
    w = []
    for c in contexts:
        try:
            w.append(model.wv[c])
        except Exception:
            continue
    w = np.mean(w, axis=0)
    for k, xss in coms.items():
        v = []
        for xs in xss:
            for x in xs[1].split():
                try:
                    v.append(model.wv[x])
                except Exception:
                    continue
        v = np.mean(v, axis=0)
        out[k] = cossim(v, w)
    return out


def calc_entityprob(prob1, prob2, alpha=0.5):
    out = {}
    total = 0
    for k, v1 in prob1.items():
        v2 = prob2[k]
        value = alpha * v1 + (1 - alpha) * v2
        out[k] = value
        total += value
    return {k: float(v) / float(total) for k, v in out.items()}


def compute(mention, contexts, model, conn, mstat, em2id, id2em, alpha=0.5):
    coms = get_comentions(conn, mention, mstat, em2id, id2em)
    prob1 = calc_prob1(coms)
    prob2 = calc_prob2(model, coms, contexts)
    return calc_entityprob(prob1, prob2, alpha)


if __name__ == "__main__":
    import sys
    from pprint import pprint
    from gensim.models import KeyedVectors
    mention = sys.argv[1]
    contexts = sys.argv[2].split(",")
    alpha = float(sys.argv[3])
    conn = sqlite3.connect("./db.sqlite3")
    trie = BytesTrie().load("./mention_stat.marisa")
    trie2 = BytesTrie().load("./em2id.marisa")
    trie3 = BytesTrie().load("./id2em.marisa")
    model = KeyedVectors.load("./enwiki_model/word2vec.model", mmap="r")
    pprint(compute(mention, contexts, model, conn,
                   trie, trie2, trie3, alpha=alpha))

type probability: エンティティリンキングに使えそうな特徴量

DBPediaには、各エンティティのタイプ情報を持つデータがあります。今回は、Wikipediaから各メンションがどのエンティティと紐付いているかを統計的に算出し、その上でDBPediaと結びつけることで、語のタイプ確率を求めます。

実行

git clone https://github.com/sugiyamath/entity_types_scripts
cd entity_types_scripts
wget http://downloads.dbpedia.org/2016-10/core/instance_types_en.ttl.bz2
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
bunzip2 *.bz2
mv enwiki-latest-pages-articles.xml dump
python build_types.py
python extract_mention.py
python json2marisa.py
python mprob.py "Obama"

出力例

python mprob.py "Obama"
{'Animal': 0.007423904974016332,
 'City': 0.05493689680772086,
 'MusicalArtist': 0.0007423904974016332,
 'President': 0.8916109873793615,
 'RailwayLine': 0.0007423904974016332,
 'School': 0.0007423904974016332,
 'SoccerPlayer': 0.011878247958426132,
 'owl#Thing': 0.03192279138827023}

 python mprob.py "Einstein"
{'Album': 0.003913894324853229,
 'ArtificialSatellite': 0.0136986301369863,
 'Congressman': 0.0019569471624266144,
 'Film': 0.0019569471624266144,
 'InformationAppliance': 0.0019569471624266144,
 'Person': 0.009784735812133072,
 'RaceHorse': 0.05675146771037182,
 'School': 0.005870841487279843,
 'Scientist': 0.8473581213307241,
 'Software': 0.009784735812133072,
 'Song': 0.007827788649706457,
 'Station': 0.0136986301369863,
 'TelevisionShow': 0.01761252446183953,
 'University': 0.005870841487279843,
 'owl#Thing': 0.0019569471624266144}

 python mprob.py "Kyoto"
{'AdministrativeRegion': 0.08986175115207373,
 'Award': 0.00030721966205837174,
 'City': 0.8935483870967742,
 'Company': 0.00030721966205837174,
 'Country': 0.00015360983102918587,
 'Diocese': 0.00015360983102918587,
 'GolfTournament': 0.00030721966205837174,
 'HistoricBuilding': 0.00030721966205837174,
 'Museum': 0.0006144393241167435,
 'PublicTransitSystem': 0.0004608294930875576,
 'Racecourse': 0.0015360983102918587,
 'RailwayLine': 0.0009216589861751152,
 'Single': 0.001075268817204301,
 'SoccerClub': 0.00030721966205837174,
 'Song': 0.001228878648233487,
 'Station': 0.0006144393241167435,
 'University': 0.004301075268817204,
 'WorldHeritageSite': 0.0004608294930875576,
 'owl#Thing': 0.003533026113671275}

 python mprob.py "Google"
{'Company': 0.9876835622927522,
 'InformationAppliance': 0.00023685457129322596,
 'Organisation': 7.895152376440866e-05,
 'Software': 0.0007105637138796779,
 'Website': 0.010342649613137533,
 'owl#Thing': 0.0009474182851729038}

python mprob.py "Python"
{'ComedyGroup': 0.005298318359824925,
 'Film': 0.0055286800276434,
 'ProgrammingLanguage': 0.9518544114259387,
 'Reptile': 0.008062658373646624,
 'RollerCoaster': 0.0034554250172771253,
 'Software': 0.000691085003455425,
 'TelevisionShow': 0.000230361667818475,
 'Weapon': 0.00598940336328035,
 'owl#Thing': 0.01888965676111495}

python mprob.py "Andy Hunt"
{'SoccerPlayer': 0.8780487804878049, 'Writer': 0.12195121951219512}

コードの中身

build_types.py

dbpediaのtypesをjsonへ。

import re
from tqdm import tqdm
import json

def load(filename):
    out = {}
    out2 = {}
    with open(filename) as f:
        for line in tqdm(f):
            if line.startswith("<") and '__' not in line:
                line = line.split()
                entity = line[0].split("/")[-1][:-1]
                if entity not in out:
                    out[entity] = []
                out[entity].append(line[2].split("/")[-1][:-1])
    for k,vs in tqdm(out.items()):
        out2[k] = list(set(vs))
    return out2


def save(out, filename):
    with open(filename, "w") as f:
        json.dump(out, f, indent=4, sort_keys=True)

if __name__ == "__main__":
    save(load("./instance_types_en.ttl"), "out.json")

extract_mention.py

wikipediaのダンプからアンカーを取り出して、「表現(mention)」と「エンティティ(Wikipediaページ名)」に分けて、「ある表現があるエンティティである回数」の統計をとる。

# coding: utf-8
import re
from tqdm import tqdm

def extract_mention_and_entity(exp):
    tmp = exp[2:-2]
    tmp2 = tmp[0].upper() + tmp[1:]
    if "|" in tmp2:
        entity, mention = tmp2.split("|")
        mention = mention.strip()
    else:
        entity = tmp2[:]
        mention = tmp[:]
    entity = entity.strip()
    entity = entity.replace(" ", "_")
    return entity, mention

if __name__ == "__main__":
    import json
    reg = re.compile(r"\[\[.+?\]\]")
    out = {}
    with open("dump", errors='ignore') as f:
        for line in tqdm(f):
            exps = re.findall(reg, line)
            for exp in exps:
                try:
                    entity, mention = extract_mention_and_entity(exp)
                except:
                    continue
                if mention in out:
                    if entity in out[mention]:
                        out[mention][entity] += 1
                    else:
                        out[mention][entity] = 1
                else:
                    out[mention] = {}

    with open("mention_stat.json", "w") as f:
        json.dump(out, f)

json2marisa.py

生成したjsonデータをmarisa_trieへ変換。

import json
import sys
from marisa_trie import BytesTrie

if __name__ == "__main__":
    print("load types")
    with open("./types.json") as f:
        data = json.load(f)

    print("types to trie")
    trie = BytesTrie([(k,bytes(json.dumps(v), "utf-8")) for k,v in data.items()])

    print("saving...")
    trie.save("types.marisa")

    print("load mention_stat")
    with open("./mention_stat.json") as f:
        data = json.load(f)

    print("mention_stat to trie")
    trie = BytesTrie([(k,bytes(json.dumps(v), "utf-8")) for k,v in data.items()])

    print("saving...")
    trie.save("mention_stat.marisa")

    print("Done!")

mprob.py

モジュール本体。

# coding: utf-8
from typing import Tuple, Dict
from marisa_trie import BytesTrie
import json

Probs = Dict[str, float]

def typeprob(mention:str, mstat:BytesTrie, types:BytesTrie) -> Probs:
    """
    Calculate type probabilities of the mention.
    Returns probabilities as dictionary (keys are type, values are prob).
    pre: len(mention) > 0
    pre: len(mstat[mention]) > 0
    pre: type(json.loads(mstat[mention][0].decode()))) == dict
    """
    
    total = 0
    prob = {}
    stat = json.loads(mstat[mention][0].decode())
    for k,v in stat.items():
        try:
            enttypes = json.loads(types[k][0].decode())
        except:
            continue
        for enttype in enttypes:
            if enttype not in prob:
                prob[enttype] = 0
            prob[enttype] += int(v)
            total += int(v)
    return dict([(k,float(v)/float(total)) for k,v in prob.items()])


if __name__ == "__main__":
    import sys
    import pprint
    mstat = BytesTrie()
    types = BytesTrie()
    mstat.load("./mention_stat.marisa")
    types.load("./types.marisa")
    pprint.pprint(typeprob(sys.argv[1], mstat, types))

注意

このプログラムの実行には8G程度のメモリが必要です。