Set the URL of the listing pages to collect in Step 1, and specify the name of the CSV file written out in Step 10.
The script extracts data from the HTML with BeautifulSoup: the HTML is fetched with urllib3 and the parsing is done with Beautiful Soup.
For analyzing the collected data, see http://www15.plala.or.jp/vffuda/ML_Suumo.html
Reference sites:
[Python] I wanted to look up rental listings, so I scraped them with BeautifulSoup (https://qiita.com/bottusan1073/items/2093b76ff7734d733879)
Finding bargain rental properties in Tokyo's 23 wards with machine learning (scraping part) (http://www.analyze-world.com/entry/2017/10/09/062445)
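As a minimal sketch of that fetch-and-parse pattern (the URL below is only a placeholder; the real script builds a SUUMO search URL in Step 1):

from bs4 import BeautifulSoup
import urllib3
import certifi

http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
response = http.request('GET', 'https://example.com/')  # placeholder URL
soup = BeautifulSoup(response.data, 'html.parser')
print(soup.title)  # confirm the HTML parsed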
from bs4 import BeautifulSoup  # web scraping
import urllib3                 # URL access
import certifi                 # CA certificates
import re                      # regular expressions
import pandas as pd            # Python data analysis library
from pandas import Series, DataFrame
import time
# Step 1: fetch the first listing page (also used later to determine the last page number)
# http = urllib3.PoolManager()  # variant without certificate verification
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                           ca_certs=certifi.where())
# Tokyo (13), Minato-ku (13103)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=13&sc=13102&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Kita-ku (26101)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26101&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Kamigyo-ku (26102)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26102&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Sakyo-ku (26103)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26103&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Nakagyo-ku (26104)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26104&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Higashiyama-ku (26105)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26105&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Shimogyo-ku (26106)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26106&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Minami-ku (26107)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26107&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Ukyo-ku (26108)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26108&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Fushimi-ku (26109)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26109&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Yamashina-ku (26110)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26110&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Kyoto (26), Kyoto-shi Nishikyo-ku (26111)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=060&bs=040&ta=26&sc=26111&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Higashi-ku (40131)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40131&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Hakata-ku (40132)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40132&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Chuo-ku (40133)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40133&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Minami-ku (40134)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40134&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Nishi-ku (40135)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40135&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Jonan-ku (40136)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40136&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Fukuoka (40), Fukuoka-shi Sawara-ku (40137)
url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=40&sc=40137&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# Okinawa (47), Naha-shi (47201)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=47&sc=47201&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2="
# Okinawa (47), Naha-shi (47201), Urasoe-shi (47205), Ginowan-shi (47208)
#url = "https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=090&bs=040&ta=47&sc=47201&sc=47205&sc=47208&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&srch_navi=1"
# !!! Also change the CSV file name in Step 10
response = http.request('GET', url)
soup = BeautifulSoup(response.data, "html.parser")
# Step 2: retrieve the last page number
pages = soup.find_all('ol', class_='pagination-parts')
#print(pages)
pages = str(pages)[::-1]  # the total page count sits near the end, so reverse the string
#print(pages)
m = re.search(r'\<\d\d\d\>', pages)  # look for the first <ddd>-style (3-digit) total page count
if m is None:
    m = re.search(r'\<\d\d\>', pages)  # look for the first <dd>-style (2-digit) total page count
if m is None:
    m = re.search(r'\<\d\>', pages)  # look for the first <d>-style (1-digit) total page count
#print(m.group(0)[::-1])
last_page_number = int(m.group(0).replace("<", "").replace(">", "")[::-1])  # group() returns the matched string; reverse it back to normal order
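# A clearer alternative sketch (assumes the page numbers are numeric links inside
# <ol class="pagination-parts">; not used by the script above):
# page_links = soup.select('ol.pagination-parts a')
# numbers = [int(a.get_text(strip=True)) for a in page_links if a.get_text(strip=True).isdigit()]
# last_page_number = max(numbers) if numbers else 1  # fall back to 1 when there is no pagination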
urls = []
urls.append(url)  # page 1
# Step 3: build the URL for each page (page 2 through the last page)
for i in range(last_page_number - 1):
    page_num = str(i + 2)
    url_page = url + '&pn=' + page_num
    urls.append(url_page)
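# e.g. with last_page_number == 3, urls now holds the base URL plus "<url>&pn=2" and "<url>&pn=3"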
names = []        # building name
addresses = []    # address
locations0 = []   # 1st location (nearest station / minutes on foot)
route0 = []       # 1st train line
station0 = []     # 1st nearest station
walk0 = []        # 1st walk time (minutes to the station)
locations1 = []   # 2nd location (nearest station / minutes on foot)
locations2 = []   # 3rd location (nearest station / minutes on foot)
ages = []         # building age (years)
heights = []      # building height (floors)
floors = []       # floor the room is on; maisonettes are uniformly 0, basements are negative
rent = []         # rent (in units of 10,000 yen)
admin = []        # management fee
others = []       # deposit / key money / guarantee / deposit deduction, amortization
layouts = []      # floor plan
areas = []        # floor area (m2)
detail_urls = []  # detail-page URL
# Step 4: fetch each listing page
print("Total pages:", last_page_number)
page = 0
for url in urls:
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data, "html.parser")
    apartments = soup.find_all('div', class_='cassetteitem')
    # Step 5: per building
    for apartment in apartments:
        room_number = len(apartment.find_all('tbody'))  # number of rooms listed for this building
        name = apartment.find('div', class_='cassetteitem_content-title').text
        address = apartment.find('li', class_='cassetteitem_detail-col1').text
        for i in range(room_number):
            names.append(name)
            addresses.append(address)
        # Step 6: up to three nearest stations (a fourth and beyond are ignored)
        sublocation = apartment.find('li', class_='cassetteitem_detail-col2')
        cols = sublocation.find_all('div')
        for i in range(len(cols)):
            text = cols[i].find(text=True)
            for j in range(room_number):
                if i == 0:
                    locations0.append(text)  # 1st location (nearest station / minutes on foot)
                    _route0 = text.split('/')[0]
                    route0.append(_route0)  # 1st train line
                    _station0 = text.split('/')[1].split(' ')[0]  # 1st nearest station
                    station0.append(_station0)
                    if " 歩" in text:
                        _walk0 = text.split(" 歩")[1]
                        _walk0 = _walk0.rstrip("分")
                        walk0.append(_walk0)  # keep only the first walk time
                    else:
                        walk0.append('99')  # no walk time shown (e.g. "X min by car")
                elif i == 1:
                    locations1.append(text)
                elif i == 2:
                    locations2.append(text)
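        # Example of the Step 6 splits on a hypothetical listing string:
        #   "ＪＲ鹿児島本線/博多駅 歩10分"
        #   -> line:    text.split('/')[0]                  == "ＪＲ鹿児島本線"
        #   -> station: text.split('/')[1].split(' ')[0]    == "博多駅"
        #   -> walk:    text.split(" 歩")[1].rstrip("分")   == "10"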
        # Step 7: building age and height
        age_and_height = apartment.find('li', class_='cassetteitem_detail-col3')
        age = age_and_height('div')[0].text
        if age == "新築":  # newly built
            age = "0"
        elif "99" in age:  # 99 years or older
            age = "99"
        else:
            age = age.rstrip("年")
            age = age.lstrip("築")
        height = age_and_height('div')[1].text
        height = re.sub(r'地下\d地上', '', height)  # ignore basement floors ("地下N地上")
        height = height.replace('平屋', '1')        # single-story building
        height = height.replace('階建', '')         # drop the "floors" suffix
        for i in range(room_number):
            ages.append(age)
            heights.append(height)
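        # Examples of the Step 7 normalization (hypothetical inputs):
        #   age:    "新築" -> "0", "築15年" -> "15"
        #   height: "3階建" -> "3", "地下1地上10階建" -> "10"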
        # Step 8: per room
        table = apartment.find('table')
        rows = table.find_all('tr')
        roomData = []  # rooms (rows) collected for this building (table)
        for tr in rows:
            cols = tr.find_all('td')
            if len(cols) != 0:
                _floor = cols[2].text
                _floor = re.sub('[\r\n\t]', '', _floor)  # strip CR/LF/tab characters
                _floor = _floor.rstrip("階")
                if '-' in _floor:
                    _floor = '0'  # maisonettes (e.g. "1-2") are uniformly set to 0
                if 'B' in _floor:
                    _floor = _floor.replace('B', '-')  # basements become negative
                _rent_cell = cols[3].find('ul').find_all('li')
                _rent = _rent_cell[0].find('span').text.rstrip("万円")  # keep a bare number so values sort cleanly
                _admin = _rent_cell[1].find('span').text.rstrip("円")   # keep a bare number so values sort cleanly
                _admin = _admin.replace('-', '0')  # a missing management fee becomes 0 yen
                _deposit_cell = cols[4].find('ul').find_all('li')
                _deposit = _deposit_cell[0].find('span').text
                _reikin = _deposit_cell[1].find('span').text
                _others = _deposit + '/' + _reikin
                _floor_cell = cols[5].find('ul').find_all('li')
                _layouts = _floor_cell[0].find('span').text
                if _layouts == "ワンルーム":  # studio
                    _layouts = "1room"
                _area = _floor_cell[1].find('span').text.rstrip("m2")  # keep a bare number so values sort cleanly
                _detail_url = cols[8].find('a')['href']
                _detail_url = 'https://suumo.jp' + _detail_url
                text = [_floor, _rent, _admin, _others, _layouts, _area, _detail_url]
                roomData.append(text)
        for row in roomData:
            floors.append(row[0])
            rent.append(row[1])
            admin.append(row[2])
            others.append(row[3])
            layouts.append(row[4])
            areas.append(row[5])
            detail_urls.append(row[6])
    time.sleep(1)  # don't hit the site in rapid succession; about 1 second, given how slow my machine is
    page += 1
    print(page, ' ', end='')  # progress indicator
# Step 9: convert each list to a Series
names = Series(names)
addresses = Series(addresses)
locations0 = Series(locations0)
route0 = Series(route0)
station0 = Series(station0)
walk0 = Series(walk0)
locations1 = Series(locations1)
locations2 = Series(locations2)
ages = Series(ages)
heights = Series(heights)
floors = Series(floors)
rent = Series(rent)
admin = Series(admin)
others = Series(others)
layouts = Series(layouts)
areas = Series(areas)
detail_urls = Series(detail_urls)
suumo_df = pd.concat([names, addresses, locations0, route0, station0, walk0, locations1, locations2, ages, heights, floors,
                      rent, admin, others, layouts, areas, detail_urls], axis=1)
suumo_df.columns = ['マンション名','住所','立地1','路線1','駅1','徒歩(分)','立地2','立地3','築年数','建物高(階)','階層',
                    '賃料(万円)','管理費(円)','敷/礼/保証/敷引,償却','間取り','専有面積(m2)','詳細URL']
suumo_df.tail(1)  # sanity check: display the last row
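# An equivalent sketch: building the DataFrame from a dict of columns sets the
# column names in the same step (not used above; '...' stands for the remaining columns):
# suumo_df = pd.DataFrame({'マンション名': names, '住所': addresses, ...})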
# Step 10: write the CSV file (separator: tab)
#suumo_df.to_csv('suumo_minato.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_shinyoko.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_kita.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_kamikyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_sakyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_chukyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_higashiyama.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_shimokyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_minami.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_ukyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_fushimi.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_yamashina.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_k_nishikyo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_higashi.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_hakata.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_chuo.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_minami.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_nishi.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_f_jonan.csv', sep = '\t', encoding = 'utf-16')
suumo_df.to_csv('suumo_f_sawara.csv', sep = '\t', encoding = 'utf-16')
#suumo_df.to_csv('suumo_naha.csv', sep = '\t', encoding = 'utf-16')
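# To load a saved file back into pandas later (a sketch; the separator and encoding
# must match the to_csv call above):
# df = pd.read_csv('suumo_f_sawara.csv', sep='\t', encoding='utf-16', index_col=0)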