ナード戦隊データマン

A blog about machine learning and natural language processing

Verifying bahuleyan-vechtomova2017semeval

SemEval-2017 Task 8 (RumourEval) is a stance detection task on tweets: Subtask A asks systems to label each reply to a rumourous source tweet as support, deny, query, or comment (SDQC).
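
For orientation, each Subtask A instance pairs a rumourous source tweet with a reply and an SDQC label. A made-up illustration (the texts here are invented, not actual dataset content):

# Hypothetical example of a Subtask A instance
example = {
    "source": "BREAKING: something has reportedly happened",  # rumourous source tweet
    "reply": "Is there any confirmation of this?",             # reply tweet
    "label": "query",  # one of: support, deny, query, comment
}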

Target of verification

Paper

UWaterloo at SemEval-2017 Task 8: Detecting Stance towards Rumours with Topic Independent Features https://aclweb.org/anthology/S17-2080

Overview

[Features]
Cue-word (cue_word) features
Number of words in the tweet
Number of uppercase characters
Counts of ?, !, and .
Number of characters
VADER sentiment scores
Cosine similarity between the source tweet and the reply tweet
Number of hashtags
Number of @user mentions
POS tag counts

[Machine learning algorithm and parameters]
XGBoost
n_estimators = 100
max_depth = 9
subsample = 0.8
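
Putting these together: each reply is represented by the concatenation of the features above and classified with XGBoost. A minimal sketch of the classifier setup with the parameters above (the full feature extraction and training scripts follow later in this post):

from xgboost import XGBClassifier

# Sketch only; the feature vector x is the concatenation of the counts, length
# features, POS-tag counts, cue-word counts, VADER scores and cosine similarity.
clf = XGBClassifier(n_estimators=100, max_depth=9, subsample=0.8)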

Setup

Download the data from: http://alt.qcri.org/semeval2017/task8/index.php?id=data-and-tools
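
After unpacking, the scripts below assume roughly the following layout (paths are relative to the working directories used in the code):

../data/train/semeval2017-task8-dataset/rumoureval-data/<story>/<source_id>/source-tweet/<source_id>.json
../data/train/semeval2017-task8-dataset/rumoureval-data/<story>/<source_id>/replies/<reply_id>.json
../data/train/semeval2017-task8-dataset/traindev/rumoureval-subtaskA-train.json
../data/test/semeval2017-task8-test-data/<source_id>/source-tweet/<source_id>.json
../data/test/semeval2017-task8-test-data/<source_id>/replies/<reply_id>.json
../data/test/subtaska.json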

Code

Preprocessing

Mapping ids to data paths

import os
import json
from tqdm import tqdm
 
 
def id2path(
        rootpath="../data/train/semeval2017-task8-dataset/rumoureval-data"):
    """Map each tweet id to its json path and each source id to its reply ids."""
    pathdict = {}
    source2id = {}
    for path1 in tqdm(os.listdir(rootpath)):  # path1: story directory
        dir1 = os.path.join(rootpath, path1)
        for path2 in os.listdir(dir1):  # path2: source tweet id
            dir2 = os.path.join(dir1, path2)
            pathdict[int(path2)] = os.path.join(
                dir2, "source-tweet/" + path2 + ".json")
            for path3 in os.listdir(os.path.join(dir2, "replies")):
                path3_rp = path3.replace(".json", "")
                if int(path2) not in source2id:
                    source2id[int(path2)] = []
                source2id[int(path2)].append(int(path3_rp))
                pathdict[int(path3_rp)] = os.path.join(
                    dir2, "replies/{}".format(path3))
    return {"pathdict": pathdict, "source2id": source2id}
 
 
def id2path_test(rootpath="../data/test/semeval2017-task8-test-data/"):
    """Same as id2path, but for the test data (no per-story subdirectories)."""
    pathdict = {}
    source2id = {}
    for path1 in tqdm(os.listdir(rootpath)):
        dir1 = os.path.join(rootpath, path1)
        pathdict[int(path1)] = os.path.join(dir1,
                                            "source-tweet/" + path1 + ".json")
        for path2 in os.listdir(os.path.join(dir1, "replies")):
            path2_rp = path2.replace(".json", "")
            if int(path1) not in source2id:
                source2id[int(path1)] = []
            source2id[int(path1)].append(int(path2_rp))
            pathdict[int(path2_rp)] = os.path.join(dir1,
                                                   "replies/{}".format(path2))
    return {"pathdict": pathdict, "source2id": source2id}
 
 
if __name__ == "__main__":
    data = id2path()
    with open("id2path_train.json", "w") as f:
        json.dump(data, f, indent=4)
 
    for k, path in data["pathdict"].items():
        assert os.path.isfile(path)
 
    data = id2path_test()
    with open("id2path_test.json", "w") as f:
        json.dump(data, f, indent=4)
 
    for k, path in data["pathdict"].items():
        assert os.path.isfile(path)
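
For reference, the resulting id2path_train.json / id2path_test.json have the following shape (ids and paths are placeholders):

{
    "pathdict": {"<tweet_id>": "<path to that tweet's json file>", ...},
    "source2id": {"<source_id>": [<reply_id>, <reply_id>, ...], ...}
}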

Extracting the tweet texts

import json
from tqdm import tqdm
 
 
def data_generator(datafile="./id2path_train.json"):
    """Collect the text of each source tweet and of all its replies into one dict."""
    out = {}
    with open(datafile) as f:
        data = json.load(f)
 
    pathdata = data["pathdict"]
    source2id = data["source2id"]
 
    for sid, childs in tqdm(source2id.items()):
        sid = str(sid)
        with open(pathdata[sid]) as f:
            stext = json.load(f)['text']
 
        out[sid] = {"stext": stext, "childs": {}}
        for cid in childs:
            cid = str(cid)
            with open(pathdata[cid]) as f:
                ctext = json.load(f)['text']
            out[sid]["childs"][cid] = ctext
    return out
 
 
if __name__ == "__main__":
    data = data_generator(datafile="./id2path_train.json")
    with open("fixed_data_train.json", "w") as f:
        json.dump(data, f)
 
    data = data_generator(datafile="./id2path_test.json")
    with open("fixed_data_test.json", "w") as f:
        json.dump(data, f)

Reshaping the extracted data

import json
import pandas as pd
from tqdm import tqdm
 
 
def id2source(data):
    """Invert the source-to-replies mapping: reply id -> source id."""
    out = {}
    for sid, v in tqdm(data.items()):
        sid = str(sid)
        for cid, _ in v["childs"].items():
            cid = str(cid)
            out[cid] = sid
    return out
 
 
def extract(labels, data, c2s):
    """Build (source text, reply text, label) rows; ids that cannot be resolved
    (e.g. labelled source tweets, which have no parent) are printed and skipped."""
    out = []
    for cid, label in tqdm(labels.items()):
        try:
            sid = c2s[cid]
            stext = data[sid]["stext"]
            ctext = data[sid]["childs"][cid]
            out.append({"stext": stext, "ctext": ctext, "label": label})
        except KeyError:
            print(cid)
    return pd.DataFrame(out)
 
 
def load(
        labelfile="../data/train/semeval2017-task8-dataset/traindev/rumoureval-subtaskA-train.json",
        datafile="./fixed_data_train.json"):
    with open(labelfile) as f:
        labels = json.load(f)
 
    with open(datafile) as f:
        data = json.load(f)
    return labels, data
 
 
if __name__ == "__main__":
    labels, data = load()
    c2s = id2source(data)
    df = extract(labels, data, c2s)
    df.to_csv("train_data.csv", index=False)
 
    labels, data = load(labelfile="../data/test/subtaska.json",
                        datafile="./fixed_data_test.json")
    c2s = id2source(data)
    df = extract(labels, data, c2s)
    df.to_csv("test_data.csv", index=False)
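
Each row of train_data.csv / test_data.csv now holds the source tweet text (stext), the reply tweet text (ctext), and the reply's stance label (label).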

Training

Feature extraction module (feature_extraction.py)

from nltk import word_tokenize, pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import copy
import numpy as np
import re
import math
 
tags = """
CC CD DT EX FW IN JJ JJR JJS LS MD
NN NNS NNP NNPS PDT POS PRP PRP$ RB RBR RBS RP
TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB""".split()
tags = {x: 0 for x in tags}
url_regex = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
quote_regex = re.compile(r'RT.+')
 
 
with open("./cue_word_list.txt") as f:
    cue_words = {line.strip(): 0 for line in f}
 
 
def mean_vect(words, kv):
    """Mean word vector of `words`; a 200-dim zero vector if none are in the vocabulary."""
    tmp = []
    for word in words:
        try:
            tmp.append(kv[word])
        except KeyError as e:
            continue
    if tmp:
        return np.mean(tmp, axis=0)
    else:
        return np.zeros(200)
 
 
def cossim(x, y):
    """Cosine similarity; returns 0.0 if either vector is all zeros."""
    cs = np.dot(x, y) / np.sqrt(np.dot(x, x) * np.dot(y, y))
    if math.isnan(cs):
        return 0.0
    else:
        return cs
 
 
def feature_extract_tweet(tweet, source, kv, analyzer):
    """Build the feature vector for one reply tweet given its source tweet."""
    tweet = str(tweet)
    source = str(source)
    tweet = re.sub(url_regex, " ", tweet)
    tweet = re.sub(quote_regex, " ", tweet)
    source = re.sub(url_regex, " ", source)
    source = re.sub(quote_regex, " ", source)
 
    cues = copy.deepcopy(cue_words)
    pos_tags = copy.deepcopy(tags)
    words = word_tokenize(tweet)
    words_source = word_tokenize(source)
    pos = pos_tag(words)
    for _, p in pos:
        try:
            pos_tags[p] += 1
        except KeyError:
            continue
 
    pos_tags = [x for _, x in pos_tags.items()]
 
    for word in words:
        if word in cues:
            cues[word] += 1
 
    cues = [x for _, x in cues.items()]
 
    counts = []
    symbols = ["!", ".", "?", "@", "#"]
    for symbol in symbols:
        counts.append(tweet.count(symbol))
 
    len_upper = sum(1 for c in tweet if c.isupper())
    len_words = len(words)
    len_chars = len(list(tweet))
    len_f = [len_upper, len_words, len_chars]
 
    reply_vect = mean_vect(words, kv)
    source_vect = mean_vect(words_source, kv)
    cs = cossim(reply_vect, source_vect)
    polarity = [x for _, x in analyzer.polarity_scores(tweet).items()]
    special = polarity + [cs]
    row = np.array(counts + len_f + pos_tags + cues + special)
    if np.any(np.isnan(row)) or np.any(np.isinf(row)):
        print(row)
        exit()
    return row
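
The word vectors loaded below from ../word2vec_model/word2vec.model are not built anywhere in this post; since mean_vect above falls back to np.zeros(200), 200-dimensional vectors are assumed. A minimal sketch of how such a model could be trained with gensim (the corpus choice and hyperparameters are my assumptions, not necessarily the author's setup):

import pandas as pd
from nltk import word_tokenize
from gensim.models import Word2Vec

# Assumption: train on the tweet texts collected above
# (nltk's punkt tokenizer data must be available for word_tokenize)
df = pd.read_csv("../data_fixer/train_data.csv")
sentences = [word_tokenize(str(t)) for t in pd.concat([df["stext"], df["ctext"]])]

# vector_size=200 to match the zero-vector fallback in mean_vect (gensim >= 4.0 API)
model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)
model.wv.save("../word2vec_model/word2vec.model")  # later loaded with KeyedVectors.load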

Training XGBoost

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import pickle
from feature_extraction import feature_extract_tweet
from gensim.models import KeyedVectors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
 
 
def load(datafile="../data_fixer/train_data.csv",
         kvfile="../word2vec_model/word2vec.model"):
    df = pd.read_csv(datafile)
    # Map each stance label to an integer id. Note: enumerate runs over every
    # unique value (including "nan"), so skipping "nan" can leave a gap in the ids.
    labels = {
        x: i
        for i, x in enumerate(np.unique(df["label"].tolist()))
        if str(x) != "nan"
    }
    print(labels)
    stexts = []
    ctexts = []
    y = []
    for i, d in df.iterrows():
        if str(d["label"]) == "nan":
            continue
        stexts.append(d["stext"])
        ctexts.append(d["ctext"])
        y.append(labels[d["label"]])
    kv = KeyedVectors.load(kvfile, mmap="r")
    analyzer = SentimentIntensityAnalyzer()
    return ctexts, stexts, y, labels, kv, analyzer
 
 
def extract_all(ctexts, stexts, kv, analyzer):
    X = []
    for ctext, stext in tqdm(zip(ctexts, stexts)):
        X.append(feature_extract_tweet(ctext, stext, kv, analyzer))
    return np.array(X)
 
 
def train(X, y):
    # Hyperparameters from the paper: 100 trees, max depth 9, subsample 0.8
    clf = XGBClassifier(n_estimators=100, max_depth=9, subsample=0.8)
    #clf = LogisticRegression()
    #clf = RandomForestClassifier()
    clf.fit(X, y)
    return clf
 
 
def save(clf, labels, outfile="model.pkl"):
    with open(outfile, "wb") as f:
        pickle.dump((clf, labels), f)
    return True
 
 
if __name__ == "__main__":
    ctexts, stexts, y, labels, kv, analyzer = load()
    X = extract_all(ctexts, stexts, kv, analyzer)
    clf = train(X, y)
    save(clf, labels)

Testing

Code

from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import pickle
from feature_extraction import feature_extract_tweet
from gensim.models import KeyedVectors
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm import tqdm
 
 
def load(datafile="../data_fixer/test_data.csv",
         kvfile="../word2vec_model/word2vec.model",
         modelfile="./model.pkl"):
    df = pd.read_csv(datafile)
    with open(modelfile, "rb") as f:
        clf, labels = pickle.load(f)
    stexts = df["stext"]
    ctexts = df["ctext"]
    y = [labels[str(x)] for x in df["label"]]
    kv = KeyedVectors.load(kvfile, mmap="r")
    analyzer = SentimentIntensityAnalyzer()
    return clf, ctexts, stexts, y, labels, kv, analyzer
 
 
def extract_all(ctexts, stexts, kv, analyzer):
    X = []
    for ctext, stext in tqdm(zip(ctexts, stexts)):
        X.append(feature_extract_tweet(ctext, stext, kv, analyzer))
    return np.array(X)
 
 
if __name__ == "__main__":
    from sklearn.metrics import classification_report, accuracy_score
    clf, ctexts, stexts, y, labels, kv, analyzer = load()
    X = extract_all(ctexts, stexts, kv, analyzer)
    y_preds = clf.predict(X)
    print(accuracy_score(y, y_preds))
    print(classification_report(y, y_preds))

Results

0.7561214495592556
              precision    recall  f1-score   support

           0       0.79      0.94      0.86       778
           1       0.00      0.00      0.00        69
           3       0.60      0.31      0.41       106
           4       0.15      0.06      0.08        68

   micro avg       0.76      0.76      0.76      1021
   macro avg       0.38      0.33      0.34      1021
weighted avg       0.67      0.76      0.70      1021
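
(The class ids come from the label map built in load(): labels are numbered by their position among the unique values of the label column, and the "nan" entry is skipped but still consumes an index. That presumably is why class 2 is missing and the remaining classes 0, 1, 3, 4 correspond, in alphabetical order, to comment, deny, query, support.)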

Discussion

The results are not good at all.

In fact, the preprocessing stage ends up excluding the source tweets themselves (labelled source tweets have no parent in the reply-to-source mapping, so they fall into the KeyError branch of extract() and are dropped), and that may well have hurt the results.

The stance detection task itself does not seem to be one that such a simple model can crack, and the accuracy reported in the paper does not look particularly impressive either.

A major challenge of stance detection is that it requires world knowledge. For example, consider the following situation.

Topic: Is the TPP a good thing?

Claim 1: The TPP is good. (PRO)
Claim 2: Politicians should understand comparative advantage. (PRO)
Claim 3: Expensive domestically produced goods will be driven out of the market. (CON)

Claim 1 requires no world knowledge, but Claims 2 and 3 do. Without world knowledge, there is no way to tell whether sentences like "politicians should understand comparative advantage" or "domestic products will be driven out" express agreement or disagreement.

So even with features like sentence similarity or bag-of-words, there is a limit to how much world knowledge can be brought in.

The following slides are a useful reference for the details:

https://www.slideshare.net/naoakiokazaki/ss-100603788

References

  1. https://aclweb.org/anthology/S17-2080
  2. https://www.slideshare.net/naoakiokazaki/ss-100603788
  3. https://github.com/HareeshBahuleyan/rumour-eval
  4. http://alt.qcri.org/semeval2017/task8/index.php?id=data-and-tools