ナード戦隊データマン

Fighting evil with data science

Predicting movie review sentiment with Word2vec

I create word embeddings with a CBOW-style Word2vec model and test whether a logistic regression can classify on top of them. The data used here is movie review text together with labels indicating whether each review is positive or negative.
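
As a rough sketch of the idea (not the code used below): each review is turned into word ids, the corresponding embedding vectors are averaged into one document vector, and a logistic regression scores that vector. A minimal NumPy illustration, where score_review, embeddings, A, and b are placeholder names rather than the variables defined later:

import numpy as np

def score_review(word_ids, embeddings, A, b):
    # embeddings: (vocab_size, embedding_size) matrix, A: (embedding_size,) weight vector, b: scalar bias
    doc_vec = embeddings[word_ids].mean(axis=0)      # average the review's word vectors
    return 1.0 / (1.0 + np.exp(-(doc_vec @ A + b)))  # logistic regression: probability the review is positive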

Run in a Jupyter notebook.

First, create a function to download the data.

In[1]:

import os
import requests
import tarfile
def load_movie_data():
    save_folder_name = 'temp'
    pos_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.pos')
    neg_file = os.path.join(save_folder_name, 'rt-polaritydata', 'rt-polarity.neg')

    # Check if files are already downloaded
    if not os.path.exists(os.path.join(save_folder_name, 'rt-polaritydata')):
        movie_data_url = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'

        # Save tar.gz file
        req = requests.get(movie_data_url, stream=True)
        with open('temp_movie_review_temp.tar.gz', 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    f.flush()
        # Extract tar.gz file into temp folder
        tar = tarfile.open('temp_movie_review_temp.tar.gz', "r:gz")
        tar.extractall(path='temp')
        tar.close()

    # Read the reviews, dropping any non-ASCII characters (the with block closes the file)
    pos_data = []
    with open(pos_file, 'r', encoding='latin-1') as f:
        for line in f:
            pos_data.append(line.encode('ascii', errors='ignore').decode())
    pos_data = [x.rstrip() for x in pos_data]

    neg_data = []
    with open(neg_file, 'r', encoding='latin-1') as f:
        for line in f:
            neg_data.append(line.encode('ascii', errors='ignore').decode())
    neg_data = [x.rstrip() for x in neg_data]
    
    texts = pos_data + neg_data
    target = [1]*len(pos_data) + [0]*len(neg_data)
    
    return(texts, target)

texts, targets = load_movie_data()
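
As a quick sanity check (the counts below are from the dataset's description and may differ slightly if the file is ever updated), the corpus should contain 5,331 positive and 5,331 negative snippets:

print(len(texts), sum(targets))   # expected: 10662 5331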

Next, import the required libraries.

In[2]:

import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import pickle
import string
import collections
import io
from nltk.corpus import stopwords
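
If the NLTK stopwords corpus has not been installed yet, stopwords.words('english') used later will raise a LookupError; a one-time download fixes that:

import nltk
nltk.download('stopwords')   # one-time download of the stopword lists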

Define some functions that will be used to build the word embeddings.

In[3]:

# Normalize text
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    
    return(texts)

# Build dictionary of words
def build_dictionary(sentences, vocabulary_size):
    # Turn sentences (list of strings) into lists of words
    split_sentences = [s.split() for s in sentences]
    words = [x for sublist in split_sentences for x in sublist]
    
    # Initialize list of [word, word_count] for each word, starting with unknown
    count = [['RARE', -1]]
    
    # Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
    count.extend(collections.Counter(words).most_common(vocabulary_size-1))
    
    # Now create the dictionary
    word_dict = {}
    # Assign each word an id equal to the current dictionary size, so ids run
    # from 0 (reserved for 'RARE') up to vocabulary_size-1 in frequency order
    for word, word_count in count:
        word_dict[word] = len(word_dict)
    
    return(word_dict)

# Turn text data into lists of integers from dictionary
def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence.split(' '):
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)
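
To illustrate what these helpers produce, here is a hypothetical toy example (not part of the original notebook; ids for equally frequent words may differ by Python version):

sample = ["This movie was GREAT!!", "This movie was not great..."]
clean = normalize_text(sample, stops=['was'])        # -> ['this movie great', 'this movie not great']
vocab = build_dictionary(clean, vocabulary_size=10)  # e.g. {'RARE': 0, 'this': 1, 'movie': 2, 'great': 3, 'not': 4}
print(text_to_numbers(clean, vocab))                 # e.g. [[1, 2, 3], [1, 2, 4, 3]]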

The preprocessing, the averaging of the word embeddings, and the logistic regression are defined with TensorFlow.

In[4]:

from sklearn.model_selection import train_test_split

def variable_summaries(var):
  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
  with tf.name_scope('summaries'):
    mean = tf.reduce_mean(var)
    tf.summary.scalar('mean', mean)
    with tf.name_scope('stddev'):
      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
    tf.summary.scalar('stddev', stddev)
    tf.summary.scalar('max', tf.reduce_max(var))
    tf.summary.scalar('min', tf.reduce_min(var))
    tf.summary.histogram('histogram', var)


with tf.name_scope("init") as scope:
    embedding_size = 200
    vocabulary_size = 2000
    batch_size = 100
    max_words = 100
    stops = stopwords.words('english')

with tf.name_scope("preprocessing") as scope:
    texts, targets = load_movie_data()
    texts = normalize_text(texts, stops)

    # Build the dictionary from the normalized text so its keys match what
    # text_to_numbers sees, and use the vocabulary_size defined above
    word_dictionary = build_dictionary(texts, vocabulary_size)

    # Keep only reviews with at least 3 words, and keep the labels in sync
    targets = np.array([t for t, x in zip(targets, texts) if len(x.split()) > 2])
    texts = [x for x in texts if len(x.split()) > 2]

    # Convert words to ids, then pad/trim every review to exactly max_words ids
    texts = text_to_numbers(texts, word_dictionary)
    texts = np.array([(x + [0]*max_words)[:max_words] for x in texts])

    X_train, X_test, y_train, y_test = train_test_split(texts, targets)
    
with tf.name_scope("placeholders"):
    x_data = tf.placeholder(shape=[None, max_words], dtype=tf.int32)
    y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

with tf.name_scope("embeddings"):
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x_data)
    embed_avg = tf.reduce_mean(embed, 1)
    
with tf.name_scope("logistic_regression"):
    with tf.name_scope("weights"):
        A = tf.Variable(tf.random_normal(shape=[embedding_size,1]))
        variable_summaries(A)
    with tf.name_scope("bias"):
        b = tf.Variable(tf.random_normal(shape=[1, 1]))
        variable_summaries(b)
    with tf.name_scope("output"):
        model_output = tf.add(tf.matmul(embed_avg, A), b)
        variable_summaries(model_output)
        
with tf.name_scope("total"):
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))
    
with tf.name_scope("train"):
    optimizer = tf.train.AdagradOptimizer(0.005)
    train_step = optimizer.minimize(loss)

with tf.name_scope("score"):
    with tf.name_scope("prediction"):
        prediction = tf.round(tf.sigmoid(model_output))
        predictions_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
    with tf.name_scope("accuracy"):
        accuracy = tf.reduce_mean(predictions_correct)

tf.summary.scalar('loss', loss)
tf.summary.scalar('accuracy', accuracy)
merged = tf.summary.merge_all()

Run the training and write out the logs.

In[5]:

with tf.Session() as sess:
    train_writer = tf.summary.FileWriter('logs_2/train', sess.graph)
    test_writer = tf.summary.FileWriter('logs_2/test')
    sess.run(tf.global_variables_initializer())
    
    for i in range(5000):
        index = np.random.choice(X_train.shape[0], size=batch_size)
        X_batch = X_train[index]
        y_batch = np.transpose([y_train[index]])
        summary,_ = sess.run([merged, train_step], feed_dict={x_data: X_batch, y_target: y_batch})
        train_writer.add_summary(summary, i)

        if i % 1000 == 0:
            acc, ls = sess.run([accuracy, loss], feed_dict={x_data: X_batch, y_target: y_batch})
            print("Iter " + str(i) + ", Minibatch Loss= "+
                 "{:.6f}".format(ls) + ", Training Accuracy= " + 
                 "{:.5f}".format(acc))
        if i % 10 == 0:
            summary, acc = sess.run([merged, accuracy], feed_dict={x_data: X_test, y_target: np.transpose([y_test])})
            test_writer.add_summary(summary, i)
            
    test_acc = sess.run(accuracy, feed_dict={x_data: X_test, y_target: np.transpose([y_test])})
    print("Test Accuracy:", test_acc)
    train_writer.close()
    test_writer.close()

Out[5]:

Iter 0, Minibatch Loss= 0.859230, Training Accuracy= 0.52000
Iter 1000, Minibatch Loss= 0.698935, Training Accuracy= 0.55000
Iter 2000, Minibatch Loss= 0.688453, Training Accuracy= 0.52000
Iter 3000, Minibatch Loss= 0.684841, Training Accuracy= 0.51000
Iter 4000, Minibatch Loss= 0.698602, Training Accuracy= 0.53000
Test Accuracy: 0.503075

Let's look at the loss and accuracy curves and the network graph in TensorBoard.

(TensorBoard screenshots: loss and accuracy curves, and the computation graph)
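
These views can be reproduced by pointing TensorBoard at the log directory written above, e.g. tensorboard --logdir logs_2, and opening the Scalars and Graphs tabs.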

Looking at these, the accuracy is only marginally better than a random predictor, so this cannot be called a good model. For comparison, the result with sklearn's tf-idf is as follows.

In[6]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Re-load the raw review strings: at this point texts holds padded id arrays,
# but TfidfVectorizer needs the original text
texts, targets = load_movie_data()

pipe = Pipeline([("vectorizer", TfidfVectorizer(min_df=4)), ("clf", LogisticRegression())])
grid = GridSearchCV(pipe, param_grid={"vectorizer__min_df": [4, 5, 6], "clf__C": [0.001, 0.01, 0.1, 1, 10]})
X_train, X_test, y_train, y_test = train_test_split(texts, targets)
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_

Out[6]:

({'clf__C': 1, 'vectorizer__min_df': 5}, 0.73461730865432717)

The scikit-learn version is more than 20 percentage points more accurate.
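
For a fairer comparison with the TensorFlow model's test accuracy, the fitted grid search could also be scored on the held-out split (not run here, so no number is shown):

print("Test Accuracy:", grid.score(X_test, y_test))   # mean accuracy on the held-out reviews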

References

https://github.com/nfmcclure/tensorflow_cookbook/tree/master/07_Natural_Language_Processing