Python　AI　画像¶¶

scikit-learnを用いて機械学習を行います。ここでは、Suumoの賃貸情報(那覇市）を用います¶

読み込むCSVファイルを指定してください(http://www15.plala.or.jp/vffuda/suumo_zscraper.html で作成してください)
scikit-learnを使用するには、Anacondaの開発環境パッケージを用いるのが簡単です。scikit-learnが含まれています。なお、Numpy、Scipy、Pandas等もインストールしてください。Anacondaの場合、conda install numpyでインストールできます。

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
import math

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.optimizers import Adam
#時間計測
import time

Using TensorFlow backend.

# Read csv file 読み込むCSVファイルを指定してください

# suumo = pd.read_csv("suumo_higashiyama.csv", sep = "\t", encoding = 'utf-16')
# names = [] # マンション名
# addresses = [] # 住所
# locations0 = [] # 立地1つ目（最寄駅/徒歩~分）
# route0 = [] # 路線一つ目
# station0 = [] # 最寄り駅一つ目
# walk0 = [] # 駅徒歩1つ目（駅徒歩~分）
# locations1 = [] # 立地2つ目（最寄駅/徒歩~分）
# locations2 = [] # 立地3つ目（最寄駅/徒歩~分）
# ages = [] # 築年数(年)
# heights = [] # 建物高さ(階)
# floors = [] # 部屋のある階 メゾネットタイプは一律０、地下はマイナス
# rent = [] # 賃料(万円)
# admin = [] # 管理費(円）
# others = [] # 敷/礼/保証/敷引,償却
# laouts = [] # 間取り
# areas = [] # 専有面積(m2)
# detail_urls = [] # 詳細URL
# suumo.tail()

# 使うものだけを読み込む
COLUMNS = ('徒歩(分)', '築年数', '建物高(階)', '階層',  '賃料(万円)', '管理費(円)', '間取り', '専有面積(m2)')

#suumo = pd.read_csv("suumo_minato.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)
#suumo = pd.read_csv("suumo_higashiyama.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)
#suumo = pd.read_csv("suumo_kyoto_minami.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)
#suumo = pd.read_csv("suumo_yamashina.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)
#suumo = pd.read_csv("suumo_hakata.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)
suumo = pd.read_csv("suumo_naha.csv", sep = "\t", encoding = 'utf-16', usecols = COLUMNS)

# カラム名変更
suumo = suumo.rename(columns = 
                     {'徒歩(分)': 'walk',
                     '築年数': 'age',
                     '建物高(階)': 'height',
                     '階層': 'floor',
                     '賃料(万円)': 'rent',
                     '管理費(円)': 'admin',
                     '間取り': 'layout',
                     '専有面積(m2)': 'area'})
suumo.tail()

suumo.dtypes

walk        int64
age         int64
height      int64
floor       int64
rent      float64
admin       int64
layout     object
area      float64
dtype: object

#suumo["floor"] = suumo["floor"].astype(object) # 階数をint型からobject型へ
#suumo.dtypes

suumo.rent = suumo.rent + suumo.admin/10000 # 管理費も賃料に含める
suumo.drop("admin", axis=1, inplace=True) # 管理費を分析対象から外す
suumo.tail()

# class distribution
print(suumo.groupby('layout').size())

layout
1DK       146
1K       1316
1LDK      299
1room     287
2DK       162
2K         23
2LDK      263
2LK         1
2SLDK       1
3DK        46
3K          1
3LDK      202
3SLDK       1
4DK         1
4K          1
4LDK       33
5DK         1
5LDK        2
dtype: int64

# 必要なデータだけ抽出
#suumo_df.query('layout in ["1DK", "1K", "1LDK", "1SDK", "1LK", "1SK", "1SLDK", "1room"]', inplace = True)
suumo_df = suumo.query("layout.str.contains('1')", inplace = True)

suumo_df = suumo.dropna() # NaNを取り除く

suumo.query("area <= 100", inplace = True) # 専有面積が100m2以上を除外
#suumo_df[suumo_df["area"] <= 100] # 専有面積が100m2以上を除外 うまくいかない

suumo.query("rent <= 40", inplace = True) # 賃料が40万円以上を除外

print(suumo_df.groupby('layout').size())

layout
1DK       146
1K       1306
1LDK      299
1room     283
dtype: int64

sns.lmplot(suumo.columns[6], suumo.columns[4], suumo, col="layout", line_kws={'color':'red'}) # なぜかカラム指定でないとうまくいかない　area x rent
plt.show()

sns.lmplot(suumo.columns[0], suumo.columns[4], suumo, col="layout", line_kws={'color':'red'}) # なぜかカラム指定でないとうまくいかない　walk x rent
plt.show()
# 駅からの距離でそれほど賃料に差がない？

sns.lmplot(suumo.columns[1], suumo.columns[4], suumo, col="layout", line_kws={'color':'red'}) # なぜかカラム指定でないとうまくいかない　age x rent
plt.show()
# 築年数でそれほど賃料に差がない？

sns.lmplot(suumo.columns[2], suumo.columns[4], suumo, col="layout", line_kws={'color':'red'}) # なぜかカラム指定でないとうまくいかない　height x rent
plt.show()
# マンションの高さでそれほど賃料に差がない？

sns.lmplot(suumo.columns[3], suumo.columns[4], suumo, col="layout", line_kws={'color':'red'}) # なぜかカラム指定でないとうまくいかない　floor x rent
plt.show()
# 階層でそれほど賃料に差がない？

plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(suumo.area, suumo.rent, hue = suumo.layout) # 間取りごとに専有面積x賃料の散布図を作成
plt.subplot(1, 2, 2)
sns.scatterplot(suumo.walk, suumo.rent, hue = suumo.layout) # 間取りごとに徒歩x賃料の散布図を作成
plt.show()

#cmap = sns.cubehelix_palette(dark=.3, light=.8, as_cmap=True)
#cmap = sns.cubehelix_palette(reverse=True)
cmap = "jet" # hsv, winter, spring, summer, Pastel1, Blues, Set1, Set2, Set3, Paired, Set1, tab10, tab20, brg, jet
plt.figure(figsize=(10, 6))
plt.subplot(1, 2, 1)
sns.scatterplot(suumo.area, suumo.rent, hue = suumo.floor, size=suumo.floor, palette=cmap) # 階数ごとに専有面積x賃料の散布図を作成
plt.subplot(1, 2, 2)
sns.scatterplot(suumo.walk, suumo.rent, hue = suumo.floor, size=suumo.floor, palette=cmap) # 階数ごとに専有面積x賃料の散布図を作成
plt.show()

# box plot 箱ひげ図(最小値、第1四分位点、中央値、第3四分位点、最大値が表される。丸は外れ値。)
color = dict(boxes='DarkOrange', whiskers='DarkBlue', medians='DarkRed', caps='Gray')
suumo_df.plot(kind='box',subplots=True,layout=(3,2),sharex=False,sharey=False,color=color,sym='ro',figsize = (9,9)) # 軸をそれぞれ描く（共有しない）
plt.show()

# histograms
suumo_df.hist(bins = 20, figsize = (9,9)) # bins defalut 10 縦軸:物件数
plt.show()

sns.pairplot(suumo_df, kind="reg", hue='layout') # kind="reg" 線形回帰

<seaborn.axisgrid.PairGrid at 0x1b0e270de48>

pd.options.display.float_format = '{:.2f}'.format
suumo_df.describe(include='all') # suumoの統計情報表
#suumo.describe().apply("{:.2f}".format) # suumoの統計情報表

# データ間の相関係数を求める
corr_mat = suumo_df.corr(method = "pearson") 
# ヒートマップで可視化
sns.heatmap(corr_mat, vmax=1, vmin=-1, center=0, annot=True, fmt=".1f")
plt.show()

sns.set() # seaborn style change
sns.pairplot(suumo_df, kind="reg", markers='.', plot_kws={'line_kws':{'color':'red'}}) # kind="reg"線形回帰線
plt.show()

# area(専有面積)と賃料のヒストグラムと散布図を表示
sns.pairplot(suumo_df[["area", "rent"]], kind="reg", markers='.', plot_kws={'line_kws':{'color':'red'}})
plt.show()

X = suumo_df.drop("rent", 1) # DataFrameから rent を取り除く
#X = X.drop("floor", 1) # DataFrameから floor を取り除く
X = X.drop("layout", 1) # DataFrameから layout を取り除く

y = suumo_df.rent

X.tail()

y.tail()

2777   4.90
2781   4.20
2782   3.60
2783   3.60
2785   3.40
Name: rent, dtype: float64

# 重回帰分析
linear_regression = LinearRegression()

# 学習
linear_regression.fit(X, y) # model.fit or clf.fit clf(classifier)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

print(linear_regression.coef_)

[-0.0124517  -0.01862777  0.1609941   0.00096836  0.08013277]

coefficient = pd.DataFrame(linear_regression.coef_)

coefficient['coefficient'] = pd.DataFrame(linear_regression.coef_)

# データ分割(訓練、テスト）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1627, 5)
(407, 5)
(1627,)
(407,)

#学習
clf = LinearRegression() # 線形回帰モデルの呼び出し clf = classifier
clf.fit(X_train, y_train) # モデルの訓練

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

print(clf.coef_) # 回帰変数の表示

[-0.01221574 -0.01381935  0.17488382 -0.02163186  0.07595386]

print(clf.intercept_) # 回帰直線の切片

2.880516674152644

print(clf.get_params()) # パラメータの取得

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

# scatter plot matrix
scatter_matrix(suumo, hist_kwds={'bins': 20}, color='g', figsize = (9,9))
plt.show()

forecast = clf.predict(X_test)
print(forecast) # 予測値の表示

[ 4.8408374   6.33601074  5.52807593  4.90469182  5.3271982   5.70271911
  5.04149828  6.22888956  4.42945848  4.42956226  5.44889524  5.01751612
  4.62260798  5.92987189  6.00521669  5.53707257  4.70689218  5.61021544
  5.03873072  4.44797914  6.4153135   4.91292592  5.43103362  5.04149828
  4.20380994  5.0074465   5.76923572  5.0186341   4.87721616  4.97042392
  5.44440943  5.1790262   3.92245163  5.65330634  6.94195419  5.56679256
  5.6749382   4.81920553  7.03568227  7.0501135   4.80808262  7.68560787
  4.87721616  7.98639472  4.72936478  4.88638996  6.00517811  5.73662376
  4.89884802  6.81860785  5.06903836  5.76169061  8.21799407  6.05963739
  5.70601233  5.12912009  5.77177513  6.31146193  4.87857744  4.1678017
  5.96975188  5.49238404  5.19350785  4.8555843   4.94655746  5.21617954
  5.87061914  5.3190785   6.32878605  5.96195297  6.5617462   5.98609119
  4.71471336  5.64912917  5.90824003  5.13943703  4.6930815   6.11956707
  4.94088036  4.65464187  6.5695457   4.66639448  5.51421487  5.90824003
  6.40147032  4.91322176  5.12864066  5.45080165  5.43966878  5.77815764
  4.71143116  4.71936523  5.08425276  4.64040222  5.06872469  6.9393528
 10.89724933  5.80696523  7.14658648  5.53584673  4.89884802  5.08537694
  6.94195419  4.8555843   6.54762821  5.72826288  5.44480348  6.06707744
  5.67511102  5.61054022  4.81910235  4.14686138  6.98521792  4.68802635
  4.89178508  5.57252268  4.89884802  6.02381372  5.98955684  5.96876576
  5.3843733   6.2507139   5.7634618   5.37331905  7.42441255  5.26778485
  4.96757619  4.33360827  4.8325141   5.20428517  6.47824185  5.29371678
  6.94195419  5.22063241  6.71198278  6.54011434  6.33188151  4.69493375
  4.36329454  5.1457997   5.46417978  4.78855188  5.08537694  5.54960079
  6.24506618  5.11890907  5.02629245  6.54011434  5.27905168  4.85553878
  5.17497473  6.2507139   4.55092376  4.9038055   5.96195297  5.55747859
  5.94028252  6.40033726  5.32805662  4.61624161  4.85963971  5.24723135
  5.12753199  6.94238192  5.65415898  4.73173992  6.53869132  6.71006596
  5.31534865  5.10453901  6.49305293  5.14793553  7.53479807  4.34166268
  4.35387842  5.61004262  5.44111813  4.7793272   6.32514868  4.75099664
  4.55290705  6.55187609  5.21527862  6.01539496  4.73361303  5.14793553
  6.16740433  4.55175941  5.94028252  5.57123265  5.88973538  4.4966783
  5.52315143  5.65627878  6.5986027   5.13304442  4.92387006  4.35524013
  5.4038909   5.28891274  6.98521792  5.86770209  6.3684124   5.12719425
  5.11198841  4.80877158  5.03988015  4.35387842  4.24435571  6.15457785
  4.82819572  6.09174413  4.83647965  6.6042948   5.05476781  5.44480348
  5.82142137  6.07155905  4.71471336  7.0501135   5.18235131  5.21306149
  4.50244048  6.43694536  6.99241854  5.55888636  5.69674288  7.49990656
  7.0501135   4.51490487  6.97078668  6.96358606  6.50861237  4.81832486
  5.72114047  5.26326864  5.11782646  4.81414579  4.66685761  7.04386667
  5.01751612  6.85260442  5.57123265  6.52321694  6.07608428  5.11536307
  5.06903836  4.84653552  5.54514703  5.96612546  5.18071084  5.84751101
  5.71724597  6.9058911   7.52699204  4.8555843   5.09086575  4.47347308
  5.9878203   4.35524013  7.9280845   6.13416814  5.18853664  4.90027026
  5.36692528  6.11351435  4.68802635  6.15884142  4.72303958  5.57911045
  6.11034116  5.95227075  5.16012844  5.40502614  6.04544558  5.6749382
  4.53888896  5.41014032  5.56677889  4.39714214  5.10601832  5.38776989
  6.43445867  6.0887093   4.76262895  5.74206965  3.96670252  6.12334998
  5.51544071  5.96191438  5.4215244   6.08018905  4.36329454  5.82898157
  5.05476781  5.01751612  6.35041791  5.73285286  5.50633706  6.2750501
  4.71010806  4.47602252  4.60889895  5.1661271   4.8663617   4.68514307
  7.07174536  5.36637903  5.7122366   5.14780273  4.67346084  5.1577396
  4.92524024  5.09778369  5.27341437  5.42552276  6.43497813  5.94028252
  6.62592666  6.7807222   6.52281926 10.89724933  7.4782747   6.33176518
  6.13720955  5.56329578  5.14155851  4.44808865  6.42310219  5.18733923
  3.76893718  5.65330634  5.35152263  6.13207058  5.71632528  7.53479807
  6.48990499  6.02684855  5.29114436  5.54514703  4.14761693  5.71422755
  4.84472984  4.82819572  5.56677889  4.97098894  5.69674288  5.54960079
  4.75754618  6.85696835  4.42956226  5.71389538  4.84787643  4.64187935
  4.86995804  6.82605709  6.04542434  4.93393412  6.5695457   5.41014032
  4.87015322  6.00768885  5.98350166  5.58634203  5.18985599  6.13556572
  5.92242428  5.63464691  6.42366848  6.20210514  6.18903619  5.26488855
  6.02720473  4.34166268  4.68134465  5.14793553  4.84699381  5.25476218
  5.13542225  5.35741832  5.54960079  5.59286451  4.58950459  4.47602252
  4.88163785  6.41334627  5.03988015  4.64972819  6.42366848  5.76923572
  4.85995623  5.67493721  4.36329454  5.78471545  7.09337722  5.96876576
  4.39714214  5.16756111  5.6318473   4.41444312  5.65330634  6.47521876
  6.13556572  5.08077164  4.7227314   5.06243678  5.47086633]

print(y_test)

961    3.90
1577   7.10
1543   5.80
1630   5.30
2004   4.70
1881   4.80
1530   5.90
1210   4.30
1989   5.65
739    3.45
1127   4.90
1931   4.90
481    5.10
1708   6.50
1570   6.60
220    6.10
2613   3.60
1447   5.90
1116   5.45
2481   4.00
1786   6.45
460    3.60
2234   4.42
1587   5.80
2698   3.25
2587   4.30
1309   6.30
611    3.48
339    4.70
2310   4.30
       ... 
1999   5.00
590    3.70
1850   4.70
2074   5.75
2082   5.75
50     3.90
1962   5.10
1926   4.70
100    6.15
2211   4.80
2535   4.05
1390   6.00
1315   6.30
2256   6.80
2416   3.90
1071   5.30
4      5.35
886    6.05
1047   4.50
167    5.10
1934   5.63
1440   5.70
2034   5.50
704    4.50
133    6.30
1031   6.40
1644   5.55
1885   4.90
450    4.50
2597   5.00
Name: rent, Length: 407, dtype: float64

# 予測精度の算出
print("train: ", clf.score(X_train, y_train)) # 決定係数の表示
print("test:  ", clf.score(X_test, y_test)) # 決定係数の表示

train:  0.2538071425832332
test:   0.4983269855958463

ERROR = (forecast - y_test) / y_test
print(np.mean(np.abs(ERROR))*100)
print(np.max(np.abs(ERROR))*100)

13.49824280951546
60.71116999542464

plt.hist(ERROR, bins=20, range=None, weights=None,
         cumulative=False, bottom=None, histtype='bar',
         align='mid', orientation='vertical', rwidth=None,
         log=False, color=None, label=None, stacked=False)

(array([ 3.,  3.,  9.,  6., 18., 28., 42., 56., 59., 50., 39., 24., 17.,
        16., 11., 15.,  5.,  2.,  3.,  1.]),
 array([-0.42040015, -0.36902456, -0.31764896, -0.26627337, -0.21489778,
        -0.16352219, -0.11214659, -0.060771  , -0.00939541,  0.04198018,
         0.09335578,  0.14473137,  0.19610696,  0.24748255,  0.29885815,
         0.35023374,  0.40160933,  0.45298492,  0.50436052,  0.55573611,
         0.6071117 ]),
 <a list of 20 Patch objects>)

#sns.set_style("darkgrid")     # 背景暗、グリッドあり。デフォルト。
#sns.set_style("whitegrid")
#sns.set_style("dark")
#sns.set_style("white")
#sns.set_style("ticks")
sns.set_style("darkgrid", {"axes.facecolor": "lightgray"})

#sns.set_context("poster")
#sns.set_context("talk")
#sns.set_context("paper")
#sns.set_context("notebook")
#sns.catplot(data=suumo_df, kind='box')
sns.catplot(data=suumo_df, kind='violin')
plt.show()

	walk	age	height	floor	rent	layout	area
count	2034.00	2034.00	2034.00	2034.00	2034.00	2034	2034.00
unique	nan	nan	nan	nan	nan	4	nan
top	nan	nan	nan	nan	nan	1K	nan
freq	nan	nan	nan	nan	nan	1306	nan
mean	13.54	8.34	5.68	3.58	5.54	NaN	26.86
std	17.95	10.63	2.90	2.10	1.79	NaN	8.01
min	1.00	0.00	2.00	0.00	2.20	NaN	1.00
25%	5.00	0.00	4.00	2.00	4.70	NaN	23.10
50%	8.00	4.00	5.00	3.00	5.35	NaN	25.90
75%	15.00	13.75	7.75	5.00	6.10	NaN	29.60
max	99.00	53.00	34.00	13.00	50.00	NaN	148.00

	walk	age	height	floor	rent	admin	layout	area
2781	3	16	5	5	3.9	3000	1room	23.00
2782	17	37	3	2	3.6	0	1LDK	26.00
2783	17	37	3	2	3.6	0	1LDK	26.00
2784	27	37	3	2	4.3	2000	2DK	38.00
2785	2	29	4	2	3.3	1000	1room	19.81

	walk	age	height	floor	area
2777	5	16	3	2	23.18
2781	3	16	5	5	23.00
2782	17	37	3	2	26.00
2783	17	37	3	2	26.00
2785	2	29	4	2	19.81

Python AI 画像¶¶

scikit-learnを用いて機械学習を行います。ここでは、Suumoの賃貸情報(那覇市）を用います¶

Python　AI　画像¶¶