Python　AI　画像¶

scikit-learnを用いて機械学習を行います。ここでは、サンプルデータを生成するほか、有名なPima Indiansの糖尿病発症データセットも用います。また、モデルの保存と再読み込みも行います。scikit-learnを使用するには、Anacondaの開発環境パッケージを用いるのが簡単です（scikit-learnが含まれています）。なお、NumPy、SciPy、pandas等もインストールしてください。Anacondaの場合、`conda install numpy`等でインストールできます。

# Example of training a final classification model.
from sklearn.linear_model import LogisticRegression
# make_blobs is imported from sklearn.datasets directly: the
# sklearn.datasets.samples_generator module was deprecated in
# scikit-learn 0.22 and removed in 0.24.
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
import numpy as np

100個のデータサンプルをランダムに生成します。生成するデータには、中心点（データが集中している部分）を2か所設けます。

# Build a reproducible 2-D toy dataset for binary classification.
X, y = make_blobs(
    n_samples=100,   # number of points to generate
    centers=2,       # number of blobs (clusters)
    n_features=2,    # number of features per point
    random_state=1,  # fixed seed so every run produces the same data
)

# Show the first three feature rows, then the complete label vector.
print(X[:3])
print(y)

# Scatter plot of the samples: first feature on x, second feature on y.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature,
            marker='o', c='blue', s=25, edgecolor='k')
plt.show()

[[-0.79415228  2.10495117]
 [-9.15155186 -4.81286449]
 [-3.10367371  3.90202401]]
[0 1 0 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0
 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0
 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0]

ロジスティック回帰モデルを学習します。ソルバーにはL-BFGS（limited-memory Broyden–Fletcher–Goldfarb–Shanno法、準ニュートン法の一種）を用います。

# Fit the final model: logistic regression using the L-BFGS solver.
model = LogisticRegression(solver='lbfgs')
model.fit(X, y)  # train the model on the generated samples
# NOTE: the score and predictions below are computed on the same data the
# model was trained on, so they show fit quality, not performance on new data.
print('score:', model.score(X, y))
print('predict:', model.predict(X))
coef = model.coef_[0]
intercept = model.intercept_
print('intercept: ', intercept)
print('coef:', coef)
# Decision boundary: the points where
#     coef[0] * x0 + coef[1] * x1 + intercept == 0,
# solved for x1 so it can be drawn as a line over x0.
line = np.linspace(-12, 0)  # start and end of the x0 range for the line
plt.plot(line, -(line * coef[0] + intercept) / coef[1], c='r', label="LogisticRegression")
plt.scatter(X[:, 0], X[:, 1], marker='o', c='blue', s=25, edgecolor='k')  # the samples
plt.legend()  # required for the label passed to plt.plot to be displayed
plt.show()

score: 1.0
predict: [0 1 0 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0
 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0
 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0]
intercept:  [-4.19693856]
coef: [-0.79590021 -0.79078463]

ここからは、アリゾナ州に住むPima Indiansの糖尿病発症データセットを用います。また、オブジェクトを直列化して状態を保存できるpickleを用いて、モデルを保存します。

import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import pickle
# Load the Pima Indians diabetes dataset from a CSV file on GitHub.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
# Column labels passed to read_csv via names=.
names = ['preg',  # number of times pregnant
         'plas',  # plasma glucose concentration
         'pres',  # diastolic blood pressure
         'skin',  # triceps skin fold thickness
         'test',  # serum insulin
         'mass',  # body mass index (BMI)
         'pedi',  # diabetes pedigree function
         'age',   # age
         'class']  # class variable (0 or 1); 1 means positive for diabetes
dataframe = pd.read_csv(url, names=names)  # read the CSV with pandas
array = dataframe.values
X = array[:, 0:8]  # explanatory variables: the first eight columns
Y = array[:, 8]    # target variable: the 'class' column
# Report the shape of Y (uppercase). Lowercase y still refers to the labels
# of the make_blobs dataset, which is not the data loaded here.
print('X shape: {}, y shape {}'.format(X.shape, Y.shape))
#dataframe.head()
dataframe.tail()  # in a notebook, displays the last rows of the table

X shape: (768, 8), y shape (100,)

# Split the data into a training part and a held-out test part.
test_size = 0.33  # fraction of the rows reserved for testing (about 1/3)
seed = 7          # random seed for the split
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Fit logistic regression on the training part only.
model = LogisticRegression(solver='lbfgs', max_iter=150)
model.fit(X_train, Y_train)

# Score the fitted model on both parts; print with three decimal places.
train_score = model.score(X_train, Y_train)
test_score = model.score(X_test, Y_test)
print('Train score: {:.3f}'.format(train_score))
print('Test  score: {:.3f}'.format(test_score))

# Pairwise scatter plots of all columns, with 20-bin histograms.
scatter_matrix(dataframe, figsize=(9, 9), color='g', hist_kwds={'bins': 20})
plt.show()

Train score: 0.776
Test  score: 0.787

モデルを保存し、再度読み込む

# Save the fitted model with pickle, then load it back.
# Save the model to disk.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:  # the with-block closes the file
    pickle.dump(model, model_file)

# some time later...

# Load the model from disk.
# NOTE: only unpickle files from a trusted source; loading a pickle can
# execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

	preg	plas	pres	skin	test	mass	pedi	age	class
763	10	101	76	48	180	32.9	0.171	63	0
764	2	122	70	27	0	36.8	0.340	27	0
765	5	121	72	23	112	26.2	0.245	30	0
766	1	126	60	0	0	30.1	0.349	47	1
767	1	93	70	31	0	30.4	0.315	23	0

Python AI 画像¶

Python　AI　画像¶