Upload 链家房源信息抓取+可视化.py
Browse files- 链家房源信息抓取+可视化.py +376 -0
链家房源信息抓取+可视化.py
ADDED
@@ -0,0 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# <h1>Table of Contents<span class="tocSkip"></span></h1>
|
5 |
+
# <div class="toc"><ul class="toc-item"><li><span><a href="#前言" data-toc-modified-id="前言-1"><span class="toc-item-num">1 </span>前言</a></span></li><li><span><a href="#爬虫" data-toc-modified-id="爬虫-2"><span class="toc-item-num">2 </span>爬虫</a></span></li><li><span><a href="#数据可视化" data-toc-modified-id="数据可视化-3"><span class="toc-item-num">3 </span>数据可视化</a></span><ul class="toc-item"><li><span><a href="#房源面积-总价散点图" data-toc-modified-id="房源面积-总价散点图-3.1"><span class="toc-item-num">3.1 </span>房源面积-总价散点图</a></span></li><li><span><a href="#各行政区均价" data-toc-modified-id="各行政区均价-3.2"><span class="toc-item-num">3.2 </span>各行政区均价</a></span></li><li><span><a href="#均价最高的10个小区" data-toc-modified-id="均价最高的10个小区-3.3"><span class="toc-item-num">3.3 </span>均价最高的10个小区</a></span></li><li><span><a href="#均价最高的10个地段" data-toc-modified-id="均价最高的10个地段-3.4"><span class="toc-item-num">3.4 </span>均价最高的10个地段</a></span></li><li><span><a href="#户型分布" data-toc-modified-id="户型分布-3.5"><span class="toc-item-num">3.5 </span>户型分布</a></span></li><li><span><a href="#词云图" data-toc-modified-id="词云图-3.6"><span class="toc-item-num">3.6 </span>词云图</a></span></li></ul></li></ul></div>
|
6 |
+
|
7 |
+
#
|
8 |
+
# ### 前言
|
9 |
+
# **本项目总共分为两个部分:**
|
10 |
+
# * 爬虫:爬取链家深圳二手房源信息,累计18906条(shenzhen.csv);
|
11 |
+
#
|
12 |
+
# * 数据可视化
|
13 |
+
|
14 |
+
# ### 爬虫
|
15 |
+
# * **爬取各个行政区房源信息;**
|
16 |
+
# * **数据保存为DataFrame;**
|
17 |
+
|
18 |
+
# In[2]:
|
19 |
+
|
20 |
+
|
21 |
+
from bs4 import BeautifulSoup
|
22 |
+
import pandas as pd
|
23 |
+
from tqdm import tqdm
|
24 |
+
import math
|
25 |
+
import requests
|
26 |
+
import lxml
|
27 |
+
import re
|
28 |
+
import time
|
29 |
+
|
30 |
+
|
31 |
+
# In[44]:
|
32 |
+
|
33 |
+
|
34 |
+
# Every Shenzhen district name mapped to the slug Lianjia uses in its URLs.
all_area_dic = {
    '罗湖区': 'luohuqu',
    '福田区': 'futianqu',
    '南山区': 'nanshanqu',
    '盐田区': 'yantianqu',
    '宝安区': 'baoanqu',
    '龙岗区': 'longgangqu',
    '龙华区': 'longhuaqu',
    '坪山区': 'pingshanqu',
}
# NOTE(review): the original script defined the full mapping and then
# immediately overwrote it, so only Luohu is actually crawled. That effective
# value is kept here; assign `all_area_dic` instead to crawl every district.
area_dic = {'罗湖区': 'luohuqu'}

# Browser-like headers sent with the requests to the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
    'Referer': 'https://sz.lianjia.com/ershoufang/',
}

# Create one session that is shared by all later requests.
sess = requests.session()
# Install the headers on the session itself so that every later sess.get()
# sends them, not only the warm-up request below.
sess.headers.update(headers)
# Warm-up request to the listing landing page; the timeout keeps the script
# from hanging forever on a stalled connection.
sess.get('https://sz.lianjia.com/ershoufang/', headers=headers, timeout=10)

# URL example: https://sz.lianjia.com/ershoufang/luohuqu/pg2/
# Placeholders: district slug, 1-based page number.
url = 'https://sz.lianjia.com/ershoufang/{}/pg{}/'
|
54 |
+
|
55 |
+
|
56 |
+
# In[45]:
|
57 |
+
|
58 |
+
|
59 |
+
def re_match(re_pattern, string, errif=None):
    """Return the first match of ``re_pattern`` in ``string``, stripped.

    Args:
        re_pattern: Regular expression handed to ``re.findall``. With one
            capture group the group text is returned; with no group the
            whole match is returned.
        string: Text to search.
        errif: Value returned when the pattern does not match at all.

    Returns:
        The first match with surrounding whitespace removed, or ``errif``
        when there is no match.
    """
    matches = re.findall(re_pattern, string)
    if not matches:
        # No match: hand back the caller-supplied default.
        return errif
    return matches[0].strip()
|
65 |
+
|
66 |
+
|
67 |
+
# In[46]:
|
68 |
+
|
69 |
+
|
70 |
+
import time
|
71 |
+
|
72 |
+
# Column order of the scraped table. Passing it to the DataFrame constructor
# keeps the expected columns present even when nothing could be scraped.
_LISTING_COLUMNS = [
    'area', 'title', 'community', 'position', 'tax',
    'total_price', 'unit_price',
    'hourseType', 'hourseSize', 'direction', 'fitment',
]


def _to_float(text):
    """Return ``float(text)``, or None when text is missing or not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def _parse_listing(info, area_name):
    """Extract one listing's fields from an "info clear" element into a dict.

    Args:
        info: One element returned by ``soup.find_all(class_="info clear")``;
            only its ``str()`` form is used.
        area_name: District name stored in the ``area`` field.

    Returns:
        Dict keyed by the names in ``_LISTING_COLUMNS``. Fields that cannot be
        found or parsed are None instead of raising.
    """
    html_text = str(info)

    # Unit price text contains thousands separators, e.g. "65,000".
    unit_text = re_match('<span>(.*?)元/平</span>', html_text)
    if unit_text is not None:
        unit_text = unit_text.replace(',', '')

    # The houseIcon line packs several attributes separated by "|":
    # layout | size | direction | fitment ...
    icon_text = re_match('class="houseIcon"></span>(.*?)</div>', html_text, errif='')
    icons = [part.strip() for part in icon_text.split('|')]
    # Pad so that a short or missing line yields None fields, not IndexError.
    icons += [None] * (4 - len(icons))
    size_text = icons[1].replace('平米', '') if icons[1] else None

    return {
        # District
        'area': area_name,
        # Listing title
        'title': re_match('target="_blank">(.*?)</a><!--', html_text),
        # Residential community name
        'community': re_match('xiaoqu.*?target="_blank">(.*?)</a>', html_text),
        # Location
        'position': re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', html_text),
        # Tax-related tag, e.g. ownership certificate older than 5 years
        'tax': re_match('class="taxfree">(.*?)</span>', html_text),
        # Total price (page shows it followed by "万")
        'total_price': _to_float(re_match('<span class="">(.*?)</span><i>万', html_text)),
        # Unit price (page shows it followed by "元/平")
        'unit_price': _to_float(unit_text),
        'hourseType': icons[0],
        'hourseSize': _to_float(size_text),
        'direction': icons[2],
        'fitment': icons[3],
    }


# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating a one-row DataFrame per listing is quadratic.
records = []

for key_, value_ in area_dic.items():
    # Read the number of listings in this district from its first page.
    start_url = 'https://sz.lianjia.com/ershoufang/{}/'.format(value_)
    html = sess.get(start_url, headers=headers, timeout=10).text
    house_num = re_match('共找到<span> (.*?) </span>套.*二手房', html)
    if house_num is None:
        # The counter is missing (unexpected page content): skip this district
        # instead of crashing the whole crawl with an IndexError.
        print('{}: 未能解析房源总数,已跳过'.format(key_))
        continue
    print('{}: 二手房源共计「{}」套'.format(key_, house_num))
    time.sleep(1)
    # Site limit: at most 100 pages (3000 listings, 30 per page) per district.
    total_page = int(math.ceil(min(3000, int(house_num)) / 30.0))

    for i in tqdm(range(total_page), desc=key_):
        # Pages are 1-based. Headers are passed explicitly so these requests
        # match the warm-up request made when the session was created.
        html = sess.get(url.format(value_, i + 1), headers=headers, timeout=10).text
        soup = BeautifulSoup(html, 'lxml')
        for info in soup.find_all(class_="info clear"):
            records.append(_parse_listing(info, key_))

# One DataFrame holding every scraped listing.
data = pd.DataFrame(records, columns=_LISTING_COLUMNS)
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
|
132 |
+
# In[47]:
|
133 |
+
|
134 |
+
|
135 |
+
# Remove the one listing whose recorded floor area is 10000+ square metres.
data = data.loc[data['hourseSize'] < 10000]
# Notebook preview of the first rows (no visible effect when run as a script).
data.head()
|
138 |
+
|
139 |
+
|
140 |
+
# In[14]:
|
141 |
+
|
142 |
+
|
143 |
+
# 保存数据
|
144 |
+
# data.to_csv('./shenzhen2.csv')
|
145 |
+
|
146 |
+
|
147 |
+
# ### 数据可视化
|
148 |
+
# * 房源面积-总价散点图
|
149 |
+
# * 各行政区均价
|
150 |
+
# * 均价最贵的10个地段
|
151 |
+
# * 均价最贵的10个小区
|
152 |
+
# * 户型分布
|
153 |
+
# * 标题文本词云图
|
154 |
+
|
155 |
+
# In[48]:
|
156 |
+
|
157 |
+
|
158 |
+
from pyecharts.charts import *
|
159 |
+
from pyecharts import options as opts
|
160 |
+
from pyecharts.commons.utils import JsCode
|
161 |
+
from jieba import posseg as psg
|
162 |
+
import collections
|
163 |
+
import pandas as pd
|
164 |
+
|
165 |
+
|
166 |
+
# In[51]:
|
167 |
+
|
168 |
+
|
169 |
+
# Load the previously saved crawl result.
data = pd.read_csv('./shenzhen.csv')
# 'Unnamed: 0' is the index column written by DataFrame.to_csv(); with
# errors='ignore' a CSV saved without that column loads without a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
# Notebook preview of the first rows (no visible effect when run as a script).
data.head()
|
172 |
+
|
173 |
+
|
174 |
+
# #### 房源面积-总价散点图
|
175 |
+
# * 💥最贵的一套房源是位于宝安-曦城的房源,总价8800W;
|
176 |
+
|
177 |
+
# In[52]:
|
178 |
+
|
179 |
+
|
180 |
+
# Scatter plot of floor area (x) against total price (y).
# pyecharts expects native Python containers, so the pandas columns are
# converted with .tolist() instead of being passed as Series.
scatter = (
    Scatter(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis(data['hourseSize'].tolist())
    .add_yaxis("房价", data['total_price'].tolist())
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        # Mark the maximum value of the series.
        markpoint_opts=opts.MarkPointOpts(
            data=[opts.MarkPointItem(type_="max", name="最大值")]
        ),
    )
    .set_global_opts(
        legend_opts=opts.LegendOpts(is_show=False),
        title_opts=opts.TitleOpts(title="深圳二手房 总价-面积 散点图"),
        xaxis_opts=opts.AxisOpts(
            name='面积',
            # Numeric axis
            type_="value",
            # Hide the split lines
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        yaxis_opts=opts.AxisOpts(
            name='总价',
            name_location='middle',
            # Numeric axis
            type_="value",
            # is_scale=True: the axis is not forced to start at 0
            is_scale=True,
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        visualmap_opts=opts.VisualMapOpts(is_show=True, type_='color', min_=100, max_=1000),
    )
)

scatter.render_notebook()
|
208 |
+
|
209 |
+
|
210 |
+
# #### 各行政区均价
|
211 |
+
# * 🗣**最贵的是南山区,整体均价9.2W/平米;**
|
212 |
+
#
|
213 |
+
# * 🗣最便宜的是坪山区,均价3.5W/平米;
|
214 |
+
|
215 |
+
# In[53]:
|
216 |
+
|
217 |
+
|
218 |
+
# Mean unit price per district, converted from yuan to units of 10,000 yuan
# and rounded to one decimal place.
temp = data.groupby(['area'])['unit_price'].mean().reset_index()
# Native floats (not numpy scalars) are handed to pyecharts.
data_pair = [
    (district, round(float(mean_price) / 10000, 1))
    for district, mean_price in zip(temp['area'], temp['unit_price'])
]

# Choropleth map of Shenzhen coloured by the district averages.
map_ = (
    Map(init_opts=opts.InitOpts(theme='dark'))
    .add("二手房均价", data_pair, '深圳', is_roam=False)
    .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="深圳各行政区二手房均价"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
        # Colour scale fixed to the 3-10 range (same unit as data_pair).
        visualmap_opts=opts.VisualMapOpts(min_=3, max_=10),
    )
)

# Written to an HTML file rather than rendered inline.
map_.render('map.html')
|
235 |
+
|
236 |
+
|
237 |
+
# #### 均价最高的10个小区
|
238 |
+
#
|
239 |
+
# * **🚫该小区内在售房源至少3套才统计**
|
240 |
+
|
241 |
+
# In[54]:
|
242 |
+
|
243 |
+
|
244 |
+
# Mean unit price and number of priced listings per community.
temp = data.groupby(['community'])['unit_price'].agg(['mean', 'count']).reset_index()

# Only communities with at least 3 listings are ranked. Communities below the
# threshold are filtered out entirely; the earlier (0, 0) placeholder could
# end up in the chart whenever fewer than 10 communities qualified.
data_pair = sorted(
    (
        (community, round(float(mean_price) / 10000, 1))
        for community, mean_price, listing_count
        in zip(temp['community'], temp['mean'], temp['count'])
        if listing_count >= 3
    ),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

# Horizontal bar chart; the pairs are reversed so the highest value is drawn
# at the top after reversal_axis().
bar = (
    Bar(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis([x[0] for x in data_pair[::-1]])
    .add_yaxis('二手房均价', [x[1] for x in data_pair[::-1]])
    .set_series_opts(
        label_opts=opts.LabelOpts(
            is_show=True,
            position='insideRight',
            font_style='italic',
        ),
        itemstyle_opts=opts.ItemStyleOpts(
            color=JsCode("""new echarts.graphic.LinearGradient(1, 0, 0, 0,
                [{
                    offset: 0,
                    color: 'rgb(0,206,209)'
                }, {
                    offset: 1,
                    color: 'rgb(218,165,32)'
                }])""")
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="深圳二手房均价TOP 10小区"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
        # NOTE(review): hard-coded axis minimum of 14; presumably chosen for
        # the original data set - confirm it still fits the current values.
        xaxis_opts=opts.AxisOpts(min_=14),
    )
    .reversal_axis()
)

bar.render_notebook()
|
276 |
+
|
277 |
+
|
278 |
+
# #### 均价最高的10个地段
|
279 |
+
|
280 |
+
# In[55]:
|
281 |
+
|
282 |
+
|
283 |
+
# Mean unit price per location. Groups whose mean is NaN (no priced listing)
# are dropped: NaN does not compare consistently, so it would make the
# ranking below unreliable.
temp = data.groupby(['position'])['unit_price'].mean().dropna().reset_index()
# Top 10 locations, in units of 10,000 yuan rounded to one decimal place.
data_pair = sorted(
    (
        (position, round(float(mean_price) / 10000, 1))
        for position, mean_price in zip(temp['position'], temp['unit_price'])
    ),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

# Vertical bar chart with a bottom-to-top colour gradient.
bar = (
    Bar(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis([x[0] for x in data_pair])
    .add_yaxis('二手房均价', [x[1] for x in data_pair])
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True, font_style='italic'),
        itemstyle_opts=opts.ItemStyleOpts(
            color=JsCode("""new echarts.graphic.LinearGradient(0, 1, 0, 0,
                [{
                    offset: 0,
                    color: 'rgb(0,206,209)'
                }, {
                    offset: 1,
                    color: 'rgb(218,165,32)'
                }])""")
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="深圳二手房均价TOP 10地段"),
        legend_opts=opts.LegendOpts(is_show=False),
        tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
    )
)

bar.render_notebook()
|
308 |
+
|
309 |
+
|
310 |
+
# #### 户型分布
|
311 |
+
#
|
312 |
+
# * 🚁三室依然是主力;
|
313 |
+
#
|
314 |
+
# * 🚗在深圳这种寸土寸金的地方,小户型占比也很多;
|
315 |
+
|
316 |
+
# In[56]:
|
317 |
+
|
318 |
+
|
319 |
+
# Number of listings per layout type (rows with a non-null 'area' value).
temp = data.groupby(['hourseType'])['area'].count().reset_index()
# Ten most common layouts; counts are converted to native ints for pyecharts.
data_pair = sorted(
    (
        (house_type, int(listing_count))
        for house_type, listing_count in zip(temp['hourseType'], temp['area'])
    ),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

# Rose-type ring chart of the layout distribution.
pie = (
    Pie(init_opts=opts.InitOpts(theme='dark'))
    .add(
        '',
        data_pair,
        radius=["30%", "75%"],
        rosetype="radius",
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="深圳二手房 户型分布"),
        legend_opts=opts.LegendOpts(is_show=False),
    )
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
)

pie.render_notebook()
|
333 |
+
|
334 |
+
|
335 |
+
# #### 词云图
|
336 |
+
#
|
337 |
+
# * 来看看房源标题中出现最多的词语
|
338 |
+
|
339 |
+
# In[57]:
|
340 |
+
|
341 |
+
|
342 |
+
# Words excluded from the word cloud.
stop_words = ['花园', '业主', '出售']
# Titles are joined with a newline rather than an empty string so the
# tokenizer cannot build a word out of the end of one title and the start of
# the next. Non-string entries (e.g. missing titles) are skipped.
string = '\n'.join(title for title in data['title'] if isinstance(title, str))

# Part-of-speech tagged tokens of all titles.
words = psg.cut(string)
# Keep a token only if it is longer than one character, its flag is neither
# 'm' nor 'x', and it is not a stop word.
word_list = [
    token.word
    for token in words
    if len(token.word) != 1
    and token.flag not in ('m', 'x')
    and token.word not in stop_words
]
|
356 |
+
|
357 |
+
|
358 |
+
# In[58]:
|
359 |
+
|
360 |
+
|
361 |
+
# The 100 most frequent title words as (word, count) pairs.
word_counter = collections.Counter(word_list)
data_pair = word_counter.most_common(100)

# Triangle-shaped word cloud built from those pairs.
wc = WordCloud()
wc.add("", data_pair, word_size_range=[20, 100], shape='triangle')
wc.set_global_opts(title_opts=opts.TitleOpts(title="房源描述词云图"))

wc.render_notebook()
|
370 |
+
|
371 |
+
|
372 |
+
# In[ ]:
|
373 |
+
|
374 |
+
|
375 |
+
|
376 |
+
|