Sentiment analysis: supporting dozens of languages by training on English alone

I figured that with the LASER sentence encoder, a multilingual sentiment-analysis model could be built zero-shot, so I'll give it a try.

Workflow

  1. Prepare LASER and the data.
  2. Write a Python module (a wrapper) around the sentence encoder.
  3. Train a simple Keras DNN model on English; each tweet is encoded with LASER before being passed as input to the model.
  4. Test on English and on Arabic.

Setup

git clone https://github.com/facebookresearch/LASER
cd LASER
mkdir laser
ln -s ${HOME}/LASER/source ${HOME}/LASER/laser/source
ln -s ${HOME}/LASER/tasks ${HOME}/LASER/laser/tasks
export LASER="${HOME}/LASER/laser"
pip install transliterate jieba
./install_models.sh
./install_external_tools.sh
cd laser
mkdir -p data/english data/arabic
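
Before downloading any data, it's worth a quick sanity check that the LASER environment variable and the model files installed above are actually visible from Python. A minimal check, assuming the layout created by the commands above:

import os

LASER = os.environ.get("LASER")
assert LASER, "LASER is not set"

# Checkpoint and BPE codes installed by install_models.sh
for name in ("models/bilstm.93langs.2018-12-26.pt", "models/93langs.fcodes"):
    path = os.path.join(LASER, name)
    print(path, "->", "OK" if os.path.isfile(path) else "MISSING")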

Next, download the two datasets: Sentiment140 for English (put training.1600000.processed.noemoticon.csv and testdata.manual.2009.06.14.csv under data/english/) and an Arabic tweet sentiment corpus whose filenames contain "positive" or "negative" (put its files under data/arabic/).
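
Sentiment140 ships as a headerless CSV in which column 0 is the polarity (0 = negative, 2 = neutral, 4 = positive) and column 5 is the tweet text, which is why the scripts below index df[0] and df[5]. A quick way to confirm the layout after downloading:

import pandas as pd

df = pd.read_csv("./data/english/training.1600000.processed.noemoticon.csv",
                 header=None, encoding="latin")
print(df[[0, 5]].head())     # polarity and tweet text
print(df[0].value_counts())  # the training file should contain only labels 0 and 4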

Code

Module (embed_handler.py)

import os
import sys
from random import choices
from string import ascii_uppercase, digits
from tempfile import NamedTemporaryFile, TemporaryDirectory
 
import numpy as np
 
assert os.environ.get('LASER'), 'Please set the environment variable LASER'
LASER = os.environ['LASER']
 
sys.path.append(LASER + '/source/lib')
sys.path.append(LASER + '/source')
 
from embed import SentenceEncoder, Token, BPEfastApply, EncodeFile
 
 
def prepare_model():
    # Load the pre-trained 93-language BiLSTM LASER encoder (on GPU).
    max_tokens = 12000
    max_sentences = None
 
    encoder = SentenceEncoder(
        os.path.join(LASER, "models/bilstm.93langs.2018-12-26.pt"),
        max_sentences=max_sentences,
        max_tokens=max_tokens,
        sort_kind='mergesort',
        cpu=False)
    return encoder
 
 
def encode_them(encoder, ifname, ofname, lang):
    # Tokenize, apply BPE, then encode a text file into raw float32 embeddings.
    bpe_codes = os.path.join(LASER, "models/93langs.fcodes")
    buffer_size = 10000
    with TemporaryDirectory() as tmpdir:
        if lang != '--':
            tok_fname = os.path.join(tmpdir, 'tok')
            Token(ifname,
                  tok_fname,
                  lang=lang,
                  romanize=True if lang == 'el' else False,
                  lower_case=True,
                  gzip=False,
                  verbose=True,
                  over_write=False)
            ifname = tok_fname
 
        if bpe_codes:
            bpe_fname = os.path.join(tmpdir, 'bpe')
            BPEfastApply(ifname,
                         bpe_fname,
                         bpe_codes,
                         verbose=True,
                         over_write=False)
            ifname = bpe_fname
 
        EncodeFile(encoder,
                   ifname,
                   ofname,
                   verbose=True,
                   over_write=False,
                   buffer_size=buffer_size)
 
 
def compute_emb(encoder, text, lang):
    # Embed newline-separated sentences; returns an (n_sentences, 1024) matrix.
    dim = 1024
    input_name = None
    # Scratch file in the current directory for the raw embedding output.
    output_name = ''.join(choices(ascii_uppercase + digits, k=16))
    with NamedTemporaryFile(mode="w+t", delete=False) as input_file:
        input_file.write(text)
        input_name = input_file.name
    encode_them(encoder, input_name, output_name, lang)
    X = np.fromfile(output_name, dtype=np.float32, count=-1)
    X.resize(X.shape[0] // dim, dim)
    os.remove(input_name)
    os.remove(output_name)
    return X
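
As a smoke test for the wrapper, embeddings can be computed for a couple of arbitrary sentences. LASER maps each input line to a 1024-dimensional vector, so the result should have shape (2, 1024):

from embed_handler import prepare_model, compute_emb

encoder = prepare_model()
X = compute_emb(encoder, "great movie, loved it\nwhat a waste of time", "en")
print(X.shape)  # (2, 1024)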

Training on English

import pandas as pd
from embed_handler import prepare_model, compute_emb
from keras.layers import Dense, Dropout
from keras.models import Sequential
 
 
def preprocessing(sents, encoder):
    return compute_emb(
        encoder, '\n'.join([sent.replace("\n", " ") for sent in sents]),
        "en")
 
 
def build_model():
    model = Sequential([
        Dense(1024, input_shape=(1024, )),
        Dense(1024, activation="relu", kernel_initializer="he_normal"),
        Dropout(0.5),
        Dense(1, activation="sigmoid", kernel_initializer="normal")
    ])
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    return model
 
 
if __name__ == "__main__":
    model = build_model()
    # Sentiment140: column 0 is the polarity (0 = negative, 4 = positive),
    # column 5 is the tweet text.
    df = pd.read_csv("./data/english/training.1600000.processed.noemoticon.csv",
                     header=None,
                     encoding="latin")
    y = df[0] > 0
    sents = df[5]
    encoder = prepare_model()
    X = preprocessing(sents, encoder)
    model.fit(X, y, epochs=2)
    model.save("model.h5")

Testing

English test

import pandas as pd
from embed_handler import prepare_model, compute_emb
from keras.models import load_model
from sklearn.metrics import classification_report
 
 
def preprocessing(sents, encoder):
    return compute_emb(encoder,
                       '\n'.join([sent.replace("\n", " ") for sent in sents]),
                       "en")
 
 
if __name__ == "__main__":
    df = pd.read_csv("./data/english/testdata.manual.2009.06.14.csv",
                     header=None,
                     encoding="latin")
    df = df[df[0] != 2]  # drop neutral tweets (polarity 2)
    y = df[0] > 0
    sents = df[5]
    encoder = prepare_model()
    X = preprocessing(sents, encoder)
    model = load_model("./model.h5")
    y_pred = model.predict_classes(X)
    print(classification_report(y, y_pred))
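
One caveat: predict_classes was removed from Keras in newer releases (TensorFlow 2.6 and later). If the script fails there, the equivalent for this binary sigmoid model is to threshold predict:

y_pred = (model.predict(X) > 0.5).astype("int32")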

Arabic test

from embed_handler import prepare_model, compute_emb
from keras.models import load_model
import os
from tqdm import tqdm
 
 
def load_data(path="./data/arabic/"):
    # Each file under path is one example; the filename encodes the label.
    y_test = []
    X_test = []
    for x in tqdm(os.listdir(path)):
        target = os.path.join(path, x)
        if os.path.isfile(target):
            if "negative" in target:
                label = False
            elif "positive" in target:
                label = True
            else:
                continue  # filename carries no label; skip
            with open(target, encoding="utf-8") as f:
                try:
                    sent = f.read().replace("\n", "")
                    X_test.append(sent)
                    y_test.append(label)
                except Exception:
                    continue
    return X_test, y_test
 
 
if __name__ == "__main__":
    from sklearn.metrics import classification_report
    model = load_model("./model.h5")
    encoder = prepare_model()
    sents, labels = load_data()
    y_test = labels
    X_test = compute_emb(encoder, '\n'.join(sents), "ar")
    y_pred = model.predict_classes(X_test)
    print(classification_report(y_test, y_pred))
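
Nothing in this script is specific to Arabic: LASER embeds all 93 supported languages into the same space, so the English-trained model can score any of them just by changing the language code passed to compute_emb. A minimal sketch with made-up French sentences (my own examples, not from any dataset):

from embed_handler import prepare_model, compute_emb
from keras.models import load_model

model = load_model("./model.h5")
encoder = prepare_model()
sents = ["ce film est magnifique, j'ai adoré",  # positive
         "quelle perte de temps totale"]        # negative
X = compute_emb(encoder, "\n".join(sents), "fr")
print(model.predict(X))  # probability of the positive class per sentence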

Test results

[english]
              precision    recall  f1-score   support

       False       0.85      0.81      0.83       177
        True       0.82      0.86      0.84       182

   micro avg       0.83      0.83      0.83       359
   macro avg       0.83      0.83      0.83       359
weighted avg       0.83      0.83      0.83       359

[arabic]
              precision    recall  f1-score   support

       False       0.72      0.77      0.75       991
        True       0.76      0.70      0.73      1000

   micro avg       0.74      0.74      0.74      1991
   macro avg       0.74      0.74      0.74      1991
weighted avg       0.74      0.74      0.74      1991