ナード戦隊データマン

A blog about machine learning and natural language processing

TMD dataset: predicting transportation mode from smartphone sensor data

The TMD dataset [1] is a free dataset for predicting a user's transportation mode from smartphone sensor data.

github.com

※ A revised version of the code in this article has been uploaded to the GitHub page above. The revised version is more concise.

Overview

The code [2] is published on GitHub: GitHub - vlomonaco/US-TransportationMode: Transportation Mode Detection with Unconstrained Smartphones Sensors

The execution flow of the code is as follows:

  1. Download the raw data.
  2. Preprocess the raw data, e.g. segment it into windows.
  3. Build and test a model using the processed data.

The window size is 5 seconds: features are extracted from each 5-second window, and the window's transportation mode is classified from them.

The features, computed per sensor, are the following (a toy sketch follows the list):

  • The sensor's mean value.
  • The sensor's maximum value.
  • The sensor's minimum value.
  • The sensor's standard deviation.
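
To make this concrete, here is a toy sketch of the four features for a single window (the magnitude values are made up, not real TMD data):

import numpy as np

# Hypothetical signal magnitudes within one 5-second window.
window = [9.8, 10.1, 12.3, 9.7, 10.5]

features = [
    np.nanmean(window),  # mean
    np.nanmax(window),   # maximum
    np.nanmin(window),   # minimum
    np.nanstd(window),   # standard deviation
]
print(features)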

Code

That said, the GitHub code is rather convoluted, so I will simplify it a little.

Preprocessing module (preprocessing.py)

import numpy as np


# Sensor-type tags as they appear in the second column of the raw CSV rows.
ACC = "android.sensor.accelerometer"
GYRO = "android.sensor.gyroscope"
SOUND = "sound"


def data_loader(infile, sensor_type):
    """Read one raw CSV file and keep only the rows for the given sensor."""
    with open(infile) as f:
        data = []
        for line in f:
            # Each row is: timestamp,sensor_name,value1,value2,...
            line = line.strip().split(",")
            if line[1] == sensor_type:
                try:
                    # Use the absolute value of the timestamp.
                    line[0] = abs(int(line[0]))
                    line[2:] = list(map(float, line[2:]))
                    data.append(line)
                except ValueError:
                    # Skip malformed rows, but note which file they came from.
                    print(infile)
    return data


def window_segmentation(data, window_size=20000):
    """Group the rows into fixed-size time windows of signal magnitudes."""
    first_t = 0.0  # start time of the current window
    out = []
    row = []
    for d in data:
        # Magnitude of the (possibly multi-axis) sensor reading.
        x = np.array(list(map(float, d[2:])))
        magnitude = np.sqrt(np.nansum(x**2))
        if int(d[0]) - first_t >= window_size:
            # Close the current window and start the next one.
            out.append(row)
            row = []
            first_t += window_size
        row.append(magnitude)
    if row:
        out.append(row)
    return out


def feature(window):
    """Summarize one window as [mean, std, max, min]; zeros if it is empty."""
    if window:
        return [
            np.nanmean(window),
            np.nanstd(window),
            np.nanmax(window),
            np.nanmin(window)
        ]
    else:
        return [0.0, 0.0, 0.0, 0.0]
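
For reference, a minimal usage sketch of this module. The file name below is hypothetical; the rows are assumed to be in the raw TMD format (timestamp,sensor_name,value1,value2,...):

from preprocessing import data_loader, window_segmentation, feature, ACC

# "U1_sample.csv" is a hypothetical file name.
acc = data_loader("U1_sample.csv", sensor_type=ACC)
windows = window_segmentation(acc, window_size=20000)
X = [feature(w) for w in windows]  # one 4-dimensional feature row per window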

Data generator (data_generator.py)

import os
import numpy as np
from tqdm import tqdm
from preprocessing import data_loader, window_segmentation, feature, ACC, GYRO, SOUND

# Adjust to wherever the raw TMD data was extracted.
root_path = "/root/work/tmd/raw_data/"

# Per-user directories; U15 is held out for validation.
TRAIN = [
    "U1", "U2", "U3", "U4", "U5", "U6", "U7", "U8", "U9", "U10", "U11", "U13",
    "U14", "U16", "U12"
]
VALID = ["U15"]
#TEST = ["U12"]

label2id = {"Bus": 0, "Car": 1, "Still": 2, "Train": 3, "Walking": 4}


def data_generate(users):
    ds = [
        os.path.join(root_path, d) for d in os.listdir(root_path) if d in users
    ]
    X = []
    y = []
    for d in tqdm(ds):
        paths = [
            os.path.join(d, path) for path in os.listdir(d) if ".csv" in path
        ]
        for path in tqdm(paths):
            try:
                # Load and window each sensor stream separately.
                acc = data_loader(path, sensor_type=ACC)
                gyro = data_loader(path, sensor_type=GYRO)
                sound = data_loader(path, sensor_type=SOUND)
                acc = window_segmentation(acc)
                gyro = window_segmentation(gyro)
                sound = window_segmentation(sound)
                # Truncate every stream to the shortest window count so they align.
                window_length = min(len(acc), len(gyro), len(sound))
                if window_length == 0:
                    continue
                acc = acc[:window_length]
                gyro = gyro[:window_length]
                sound = sound[:window_length]
            except Exception as e:
                with open("training.log", "a") as f:
                    f.write("file:{}, error: {}\n".format(path, repr(e)))
                continue
            # The transportation-mode label is encoded in the file name.
            labelid = label2id[path.split("_")[3]]
            labels = [labelid] * window_length

            # One feature row per window: acc + gyro + sound statistics.
            tmp_X = []
            for a, g, s in zip(acc, gyro, sound):
                tmp_X.append(feature(a) + feature(g) + feature(s))
            try:
                assert len(tmp_X) == len(labels)
            except AssertionError as e:
                with open("training.log", "a") as f:
                    f.write("file:{}, error: {}\n".format(path, repr(e)))
                continue
            X += tmp_X
            y += labels

    X = np.array(X)
    y = np.array(y)
    return X, y
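
Since the classes are unevenly represented (see the support column in the report below, and the balancing step in the addendum), it can be worth checking the label distribution first. A minimal sketch:

import numpy as np
from data_generator import data_generate, TRAIN

X, y = data_generate(TRAIN)
ids, counts = np.unique(y, return_counts=True)
print(dict(zip(ids.tolist(), counts.tolist())))  # number of windows per class id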

Training and testing

import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from data_generator import data_generate, TRAIN, VALID

if __name__ == "__main__":
    debug = False

    # parameters for GridSearchCV
    param_grid = {
        "n_estimators": [100],
        "max_depth": [3, 5],
        "min_samples_split": [10, 20],
        "min_samples_leaf": [5, 10, 20],
        "max_leaf_nodes": [20, 40],
        "min_weight_fraction_leaf": [0.1]
    }

    # Balanced class weights to compensate for the unequal class sizes.
    model = RandomForestClassifier(class_weight="balanced")
    grid_search = GridSearchCV(model, param_grid=param_grid)

    if debug:
        # Quick sanity check on a single user's data.
        X_train, y_train = data_generate(["U1"])
        X_train = pd.DataFrame(X_train).replace([np.inf, -np.inf, np.nan], 0.0)
        X_train.to_csv("debug.csv")
        model.fit(X_train, y_train)
    else:
        X_train, y_train = data_generate(TRAIN)
        X_test, y_test = data_generate(VALID)
        # Cache the generated features so reruns can skip data generation.
        with open("training.pkl", "wb") as f:
            pickle.dump((X_train, y_train, X_test, y_test), f)
        #with open("training.pkl", "rb") as f:
        #    X_train, y_train, X_test, y_test = pickle.load(f)

        # Replace non-finite feature values before fitting.
        X_train = pd.DataFrame(X_train).replace([np.inf, -np.inf, np.nan], 0.0)
        X_test = pd.DataFrame(X_test).replace([np.inf, -np.inf, np.nan], 0.0)
        grid_search.fit(X_train, y_train)

        # Refit a fresh model with the best hyperparameters, then evaluate.
        model = RandomForestClassifier(class_weight="balanced",
                                       **grid_search.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

Accuracy

              precision    recall  f1-score   support

           0       0.10      0.42      0.16        12
           1       0.73      0.43      0.54        51
           2       1.00      0.57      0.73        94
           3       0.65      0.83      0.73        66
           4       0.92      1.00      0.96        44

    accuracy                           0.67       267
   macro avg       0.68      0.65      0.62       267
weighted avg       0.81      0.67      0.71       267

Discussion

The accuracy is not great. I may have made a mistake somewhere in the preprocessing: running their GitHub code yields over 80% accuracy, so something at the preprocessing stage most likely differs.

If you want to do the annotation yourself, an annotation tool is published here: http://cs.unibo.it/projects/us-tm2017/tutorial.html

In any case, if you want to tackle the problem of predicting transportation modes from sensor data, the TMD dataset is freely available, so give it a try if you are interested.

Alternatively, a much larger dataset is available:

Sussex-Huawei Locomotion Dataset

Addendum

2019/07/18 8:55

Rewriting the training script as follows (to balance the data) seems to have improved the accuracy.

import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from data_generator import data_generate, DATA


def prepare_models():
    print("preparing model")
    param_grid = {
        "n_estimators": [100],
        "max_depth": [3, 5],
        "min_samples_split": [10, 20],
        "min_samples_leaf": [5, 10, 20],
        "max_leaf_nodes": [20, 40],
        "min_weight_fraction_leaf": [0.1]
    }

    model = RandomForestClassifier(class_weight="balanced")
    grid_search = GridSearchCV(model, param_grid=param_grid)
    return model, grid_search


def debug_do(model):
    print("debug do")
    # Quick sanity check on a single user's data.
    X_train, y_train = data_generate(["U1"])
    X_train = pd.DataFrame(X_train).replace([np.inf, -np.inf, np.nan], 0.0)
    X_train.to_csv("debug.csv")
    model.fit(X_train, y_train)


def load_data(load=False):
    print("load data")
    labels = [0, 1, 2, 3, 4]
    if load:
        # Reuse the cached features from a previous run.
        with open("training.pkl", "rb") as f:
            X, y = pickle.load(f)
    else:
        X, y = data_generate(DATA)
        with open("training.pkl", "wb") as f:
            pickle.dump((X, y), f)
    return X, y, labels


def prepare_data(X, y, labels):
    print("prepare data")
    df = pd.DataFrame(X).replace([np.inf, -np.inf, np.nan], 0.0)

    df["label"] = y

    # Undersample: find the size of the smallest class...
    min_nlabel = min(sum(df["label"] == label) for label in labels)

    # ...and keep only that many rows from every class.
    data = [df[df["label"] == label].iloc[:min_nlabel] for label in labels]
    assert data[0].shape[0] == min_nlabel
    print("min_nlabel:", min_nlabel)

    df = pd.concat(data)
    y = df["label"]
    X = df.drop(columns=["label"])

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    debug = False
    load = False
    model, grid_search = prepare_models()

    if debug:
        debug_do(model)
    else:
        X, y, labels = load_data(load)
        X_train, X_test, y_train, y_test = prepare_data(X, y, labels)
        grid_search.fit(X_train, y_train)
        model = RandomForestClassifier(
            class_weight="balanced",
            **grid_search.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred))

The resulting report:

              precision    recall  f1-score   support

           0       0.68      0.76      0.72        82
           1       0.83      0.69      0.75        75
           2       0.75      0.93      0.83        57
           3       0.74      0.71      0.72        68
           4       0.99      0.89      0.93        80

    accuracy                           0.79       362
   macro avg       0.80      0.79      0.79       362
weighted avg       0.80      0.79      0.79       362

References