ナード戦隊データマン

Fighting evil with data science

Automatic labeling for TSA

There are TSA datasets such as SentiHood, but they are not particularly large. Here we implement the automatic labeling method described in the paper Unsupervised Aspect Term Extraction with B-LSTM & CRF using Automatically Labelled Datasets [1].

Code

Module

from nltk.corpus import stopwords
 
# English stopwords are never emitted as targets (see iob_format_tokens).
_stps = {word: True for word in stopwords.words("english")}
 
# Load the positive/negative opinion word lists into a {word: label} dict.
def load_lexicon(posfile="../data/lexicon/opinion-lexicon/positive-words.txt",
                 negfile="../data/lexicon/opinion-lexicon/negative-words.txt",
                 pos_label="pos",
                 neg_label="neg"):
    out = {}
    for infile, label in zip([posfile, negfile], [pos_label, neg_label]):
        with open(infile, encoding="latin") as f:
            for line in f:
                if line.startswith(";"):
                    continue
                line = line.strip()
                if line:
                    out[line] = label
    return out
 
 
def tokens2dict(tokens):
    # Map CoreNLP token index -> surface word for one sentence.
    out = {}
    for token in tokens:
        out[token["index"]] = token["word"]
    return out
 
 
def _dep_ex(dep):
    # Unpack a dependency as (relation, governor index, dependent index).
    return dep["dep"], dep["governor"], dep["dependent"]
 
 
def _get_sentiment(target,
                   id2word,
                   lexicon,
                   negs,
                   pos_label="pos",
                   neg_label="neg"):
    # Return the lexicon polarity of the given opinion word, flipped when
    # the word is negated (i.e. it governs a "neg" relation).
    sentiment = lexicon[id2word[target]]
    if target in negs:
        if sentiment == pos_label:
            return neg_label
        if sentiment == neg_label:
            return pos_label
    return sentiment
 
 
def _create_neg_dict(deps):
    # Collect indices of tokens that govern a "neg" relation (negated words).
    negs = {}
    for dep in deps:
        p, g, d = _dep_ex(dep)
        if p == "neg":
            negs[g] = True
    return negs
 
 
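# Step 1 (seeding): scan the dependency triples and label candidate target
# tokens directly from the lexicon:
#   dobj                 : an opinion verb labels its direct object
#   nsubj + cop          : a copular opinion predicate labels its subject
#   nsubj + advmod/xcomp : an opinion modifier of the predicate labels the subject
#   pobj/dobj + amod     : an opinion adjective labels the noun it modifies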
def _search_stepone(negs,
                    deps,
                    id2word,
                    lexicon,
                    pos_label="pos",
                    neg_label="neg"):
    out = {}
    stack = []
    for dep in deps:
        p, g, d = _dep_ex(dep)
        if p == "dobj" and id2word[g] in lexicon:
            out[d] = lexicon[id2word[g]]
            stack.append(d)
        if p == "nsubj":
            for dep2 in deps:
                p2, g2, d2 = _dep_ex(dep2)
                if g == g2 and p2 == "cop" and id2word[g2] in lexicon:
                    out[d] = _get_sentiment(g2, id2word, lexicon, negs,
                                            pos_label, neg_label)
                    stack.append(d)
                if g == g2 and p2 in ["advmod", "xcomp"
                                      ] and id2word[d2] in lexicon:
                    out[d] = _get_sentiment(d2, id2word, lexicon, negs,
                                            pos_label, neg_label)
                    stack.append(d)
        if p in ["pobj", "dobj"]:
            for dep2 in deps:
                p2, g2, d2 = _dep_ex(dep2)
                if d == g2 and p2 == "amod" and id2word[d2] in lexicon:
                    out[d] = _get_sentiment(d2, id2word, lexicon, negs,
                                            pos_label, neg_label)
                    stack.append(d)
    return out, stack
 
 
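# Step 2 (propagation): copy labels from already-labelled tokens to tokens
# linked by conj (coordination) or compound relations, in both directions;
# the loop runs twice so labels can also travel across chained relations.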
def _search_steptwo(out, stack, deps):
    for i in range(2):
        for dep in deps:
            p, g, d = _dep_ex(dep)
            if p in ["conj"] and d in stack:
                out[g] = out[d]
                stack.append(g)
            if p == "compound" and d in stack:
                out[g] = out[d]
                stack.append(g)
            if p in ["conj"] and g in stack:
                out[d] = out[g]
                stack.append(d)
            if p == "compound" and g in stack:
                out[d] = out[g]
                stack.append(d)
 
    return out, stack
 
 
def rules(deps, id2word, lexicon, pos_label="pos", neg_label="neg"):
    # Apply the seeding and propagation rules and return a mapping from
    # token index to sentiment label.
    negs = _create_neg_dict(deps)
    out, stack = _search_stepone(negs, deps, id2word, lexicon, pos_label,
                                 neg_label)
    out, stack = _search_steptwo(out, stack, deps)
    return out
 
 
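# Emit (word, tag) pairs in token order: labelled tokens get b-pos / b-neg
# (or i-* when they continue the previous label); stopwords and unlabelled
# tokens get "o".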
def iob_format_tokens(id2word, labels, stps=_stps):
    out = []
    tokens = sorted(id2word.items(), key=lambda x: int(x[0]))
    prev = None
    for k, token in tokens:
        k = int(k)
        word = id2word[k]
        if word.lower() in stps:
            label = "o"
            prev = None
        elif k in labels:
            if prev == labels[k]:
                label = "i-" + labels[k]
            else:
                label = "b-" + labels[k]
            prev = labels[k]
        else:
            label = "o"
            prev = None
        out.append((word, label))
    return out
 
 
if __name__ == "__main__":
    from pycorenlp import StanfordCoreNLP
    nlp = StanfordCoreNLP("http://localhost:9000")
 
    lexicon = load_lexicon()
 
    ys = [["screen"], ["speakers"], ["touchpad"], ["price"],
          ["Screen", "speakers"], ["wifi", "card"], ["Alice"],
          ["Sheldon", "Cooper", "friends"]]
    sents = [
        "I like the screen", "The internal speakers are amazing",
        "The touchpad works perfectly", "This laptop has great price",
        "Screen and speakers are awful", "The wifi card is not good",
        "Alice is very beautiful",
        "Sheldon Cooper and his friends are very good"
    ]
 
    for sent, y in zip(sents, ys):
        out = nlp.annotate(sent,
                           properties={
                               'annotators': 'ssplit,depparse',
                               'outputFormat': 'json'
                           })
 
        id2word = tokens2dict(out["sentences"][0]["tokens"])
        predicted = rules(out["sentences"][0]["basicDependencies"], id2word,
                          lexicon)
        assert {id2word[k] for k in predicted} == set(y)

Example with sentiment140

import sys
import pandas as pd
from tqdm import tqdm
import re
from pycorenlp import StanfordCoreNLP
 
sys.path.append("../module/")
 
from tsa_annotate import load_lexicon, tokens2dict, rules, iob_format_tokens
 
regex1 = re.compile(r"[@#]\S+")  # strip @mentions and #hashtags
regex2 = re.compile(r"http\S+")  # strip URLs
nlp = StanfordCoreNLP("http://localhost:9000")
 
 
def annotate(tweets, outfile="out.txt"):
    lexicon = load_lexicon()
    for tweet in tqdm(tweets):
        tweet = re.sub(regex1, "", tweet)
        tweet = re.sub(regex2, "", tweet)
 
        out = nlp.annotate(tweet,
                           properties={
                               'annotators': 'ssplit,depparse',
                               'outputFormat': 'json'
                           })
        sents = out["sentences"]
        for sent in sents:
            id2word = tokens2dict(sent["tokens"])
            labels = rules(sent["basicDependencies"], id2word, lexicon)
            result = iob_format_tokens(id2word, labels)
            with open(outfile, "a") as f:
                # Append "word<TAB>tag" lines, one sentence per block,
                # separated by a blank line (CoNLL-style).
                f.write('\n'.join('\t'.join(r) for r in result))
                f.write("\n\n")
 
 
if __name__ == "__main__":
    datafile = "../data/sentiment140/training.1600000.processed.noemoticon.csv"
    df = pd.read_csv(datafile, encoding="latin", header=None)
    annotate(df[5])

One of the annotated sentences

I       o
'm      o
guessing        o
Rickys  o
not     o
a       o
fan!Can o
'       o
t       o
read    o
his     o
blog    o
,       o
only    o
got     o
limited o
internet        b-neg
access  i-neg
from    o
my      o
phone   o

Explanation

We start a CoreNLP server and use the ssplit and depparse annotators; depparse gives us the dependency structure of each sentence. On top of that we use a sentiment lexicon [2]. Combining the dependency structure with the lexicon, targets are annotated by the rules above. For the details, please read the code or the paper.
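
To make the rule-based step concrete, below is a minimal sketch that runs one of the test sentences through the pipeline and prints the IOB-tagged tokens. It assumes the module above is importable as tsa_annotate and that a CoreNLP server is already running on localhost:9000 (it can typically be started with java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 from the CoreNLP directory).

from pycorenlp import StanfordCoreNLP
from tsa_annotate import load_lexicon, tokens2dict, rules, iob_format_tokens
 
nlp = StanfordCoreNLP("http://localhost:9000")
lexicon = load_lexicon()
 
out = nlp.annotate("The internal speakers are amazing",
                   properties={
                       'annotators': 'ssplit,depparse',
                       'outputFormat': 'json'
                   })
sent = out["sentences"][0]
id2word = tokens2dict(sent["tokens"])
 
# "amazing" is a positive lexicon entry that governs "speakers" via nsubj
# (with "are" as cop), so the nsubj + cop rule should label "speakers" as pos.
labels = rules(sent["basicDependencies"], id2word, lexicon)
for word, tag in iob_format_tokens(id2word, labels):
    print(word, tag)

If the parser returns the expected nsubj/cop relations, "speakers" should come out as b-pos and every other token as o.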