qingni committed on
Commit
b505fc0
1 Parent(s): d7ecb30

Upload 链家房源信息抓取+可视化.py

Browse files
Files changed (1) hide show
  1. 链家房源信息抓取+可视化.py +376 -0
链家房源信息抓取+可视化.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # <h1>Table of Contents<span class="tocSkip"></span></h1>
5
+ # <div class="toc"><ul class="toc-item"><li><span><a href="#前言" data-toc-modified-id="前言-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>前言</a></span></li><li><span><a href="#爬虫" data-toc-modified-id="爬虫-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>爬虫</a></span></li><li><span><a href="#数据可视化" data-toc-modified-id="数据可视化-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>数据可视化</a></span><ul class="toc-item"><li><span><a href="#房源面积-总价散点图" data-toc-modified-id="房源面积-总价散点图-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>房源面积-总价散点图</a></span></li><li><span><a href="#各行政区均价" data-toc-modified-id="各行政区均价-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>各行政区均价</a></span></li><li><span><a href="#均价最高的10个小区" data-toc-modified-id="均价最高的10个小区-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>均价最高的10个小区</a></span></li><li><span><a href="#均价最高的10个地段" data-toc-modified-id="均价最高的10个地段-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>均价最高的10个地段</a></span></li><li><span><a href="#户型分布" data-toc-modified-id="户型分布-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>户型分布</a></span></li><li><span><a href="#词云图" data-toc-modified-id="词云图-3.6"><span class="toc-item-num">3.6&nbsp;&nbsp;</span>词云图</a></span></li></ul></li></ul></div>
6
+
7
+ #
8
+ # ### 前言
9
+ # **本项目总共分为两个部分:**
10
+ # * 爬虫:累计爬取链家深圳二手房源信息18906条,保存为 shenzhen.csv;
11
+ #
12
+ # * 数据可视化
13
+
14
+ # ### 爬虫
15
+ # * **爬取各个行政区房源信息;**
16
+ # * **数据保存为DataFrame;**
17
+
18
+ # In[2]:
19
+
20
+
21
+ from bs4 import BeautifulSoup
22
+ import pandas as pd
23
+ from tqdm import tqdm
24
+ import math
25
+ import requests
26
+ import lxml
27
+ import re
28
+ import time
29
+
30
+
31
+ # In[44]:
32
+
33
+
34
# Shenzhen district names mapped to the slug used in the listing URLs.
area_dic = {
    '罗湖区': 'luohuqu',
    '福田区': 'futianqu',
    '南山区': 'nanshanqu',
    '盐田区': 'yantianqu',
    '宝安区': 'baoanqu',
    '龙岗区': 'longgangqu',
    '龙华区': 'longhuaqu',
    '坪山区': 'pingshanqu',
}
# NOTE(review): this reassignment replaces the full mapping above, so only
# one district is crawled. It looks like a debugging leftover -- delete it
# to crawl every district; kept here to preserve current behaviour.
area_dic = {'罗湖区': 'luohuqu'}

# Browser-like headers sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
           'Referer': 'https://sz.lianjia.com/ershoufang/'}

# Create a session and attach the headers to it, so that every later
# sess.get(...) call carries them even when the call site does not pass
# headers= explicitly.
sess = requests.session()
sess.headers.update(headers)
# Warm-up request against the listing index; a timeout keeps a stalled
# connection from hanging the script indefinitely.
sess.get('https://sz.lianjia.com/ershoufang/', headers=headers, timeout=10)

# URL template, e.g. https://sz.lianjia.com/ershoufang/luohuqu/pg2/
# First placeholder: district slug; second placeholder: 1-based page number.
url = 'https://sz.lianjia.com/ershoufang/{}/pg{}/'
54
+
55
+
56
+ # In[45]:
57
+
58
+
59
def re_match(re_pattern, string, errif=None):
    """Return the first match of *re_pattern* in *string*, stripped.

    When the pattern does not match anywhere in *string*, the fallback
    value *errif* is returned instead.
    """
    found = re.findall(re_pattern, string)
    if not found:
        return errif
    return found[0].strip()
65
+
66
+
67
+ # In[46]:
68
+
69
+
70
import time


def _to_float(text):
    """Return *text* parsed as a float, or None if it is missing or not numeric.

    Thousands separators (',') are removed before parsing.
    """
    if text is None:
        return None
    try:
        return float(text.replace(',', ''))
    except ValueError:
        return None


# Seconds to wait for a response before giving up on a request.
request_timeout = 10

# Parsed listings are collected as plain dicts and turned into a DataFrame
# once at the end; concatenating a one-row frame per listing copies the
# whole table every time and is quadratic in the number of listings.
records = []

for key_, value_ in area_dic.items():
    # Read the number of listings available for this district.
    start_url = 'https://sz.lianjia.com/ershoufang/{}/'.format(value_)
    html = sess.get(start_url, headers=headers, timeout=request_timeout).text
    house_num = re_match('共找到<span> (.*?) </span>套.*二手房', html)
    try:
        house_count = int(house_num)
    except (TypeError, ValueError):
        # The count is absent or not numeric (the page did not have the
        # expected markup): skip this district rather than abort the crawl.
        print('{}: 未能解析房源总数,跳过该行政区'.format(key_))
        continue
    print('{}: 二手房源共计「{}」套'.format(key_, house_num))
    time.sleep(1)
    # The site exposes at most 100 pages (30 listings each, 3000 in total)
    # per district.
    total_page = int(math.ceil(min(3000, house_count) / 30.0))

    for i in tqdm(range(total_page), desc=key_):
        page_url = url.format(value_, i + 1)
        html = sess.get(page_url, headers=headers, timeout=request_timeout).text
        soup = BeautifulSoup(html, 'lxml')

        for info in soup.find_all(class_="info clear"):
            # Serialise the tag once; every field below is extracted from it.
            info_html = str(info)

            info_dic = {}
            # District
            info_dic['area'] = key_
            # Listing title
            info_dic['title'] = re_match('target="_blank">(.*?)</a><!--', info_html)
            # Residential community name
            info_dic['community'] = re_match('xiaoqu.*?target="_blank">(.*?)</a>', info_html)
            # Position (neighbourhood)
            info_dic['position'] = re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', info_html)
            # Tax-related tag
            info_dic['tax'] = re_match('class="taxfree">(.*?)</span>', info_html)
            # Total price; None when absent or not numeric
            info_dic['total_price'] = _to_float(
                re_match('<span class="">(.*?)</span><i>万', info_html))
            # Unit price; None when absent or not numeric
            info_dic['unit_price'] = _to_float(
                re_match('<span>(.*?)元/平</span>', info_html))

            # The houseIcon text is a '|'-separated list; the first four
            # fields are layout, floor area, orientation and fitment.
            # Missing or empty fields become None so that one malformed
            # listing cannot abort the whole crawl.
            icon_text = re_match('class="houseIcon"></span>(.*?)</div>', info_html, errif='')
            icons = [part.strip() or None for part in icon_text.split('|')]
            icons += [None] * (4 - len(icons))
            info_dic['hourseType'] = icons[0]
            info_dic['hourseSize'] = (
                _to_float(icons[1].replace('平米', '')) if icons[1] else None)
            info_dic['direction'] = icons[2]
            info_dic['fitment'] = icons[3]

            records.append(info_dic)

# One row per listing; columns follow the key order used above.
data = pd.DataFrame(records)
128
+
129
+
130
+
131
+
132
+ # In[47]:
133
+
134
+
135
# Drop the outlier listing whose floor area is 10,000 square metres or more.
is_plausible_size = data['hourseSize'] < 10000
data = data.loc[is_plausible_size]
data.head()
138
+
139
+
140
+ # In[14]:
141
+
142
+
143
+ # 保存数据
144
+ # data.to_csv('./shenzhen2.csv')
145
+
146
+
147
+ # ### 数据可视化
148
+ # * 房源面积-总价散点图
149
+ # * 各行政区均价
150
+ # * 均价最贵的10个地段
151
+ # * 均价最贵的10个小区
152
+ # * 户型分布
153
+ # * 标题文本词云图
154
+
155
+ # In[48]:
156
+
157
+
158
+ from pyecharts.charts import *
159
+ from pyecharts import options as opts
160
+ from pyecharts.commons.utils import JsCode
161
+ from jieba import posseg as psg
162
+ import collections
163
+ import pandas as pd
164
+
165
+
166
+ # In[51]:
167
+
168
+
169
# Load the saved listings and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier DataFrame.to_csv call --
# TODO confirm against the file).
data = pd.read_csv('./shenzhen.csv').drop(columns=['Unnamed: 0'])
data.head()
172
+
173
+
174
+ # #### 房源面积-总价散点图
175
+ # * 💥最贵的一套房源是位于宝安-曦城的房源,总价8800W;
176
+
177
+ # In[52]:
178
+
179
+
180
# Scatter plot of floor area (x) against total price (y).
# pyecharts serialises chart options to JSON and expects native Python
# containers and scalars, so the pandas columns are converted with
# .tolist() instead of being passed as Series objects.
scatter = (Scatter(init_opts=opts.InitOpts(theme='dark'))
           .add_xaxis(data['hourseSize'].tolist())
           .add_yaxis("房价", data['total_price'].tolist())
           .set_series_opts(label_opts=opts.LabelOpts(is_show=False),
                            # Mark the maximum of the series.
                            markpoint_opts=opts.MarkPointOpts(
                                data=[opts.MarkPointItem(type_="max", name="最大值"), ]))
           .set_global_opts(
               legend_opts=opts.LegendOpts(is_show=False),
               title_opts=opts.TitleOpts(title="深圳二手房 总价-面积 散点图"),
               xaxis_opts=opts.AxisOpts(
                   name='面积',
                   # Numeric (value) axis rather than a category axis.
                   type_="value",
                   # Hide the split lines.
                   splitline_opts=opts.SplitLineOpts(is_show=False)),
               yaxis_opts=opts.AxisOpts(
                   name='总价',
                   name_location='middle',
                   # Numeric (value) axis rather than a category axis.
                   type_="value",
                   # With is_scale=True the axis is not forced to start at 0.
                   is_scale=True,
                   splitline_opts=opts.SplitLineOpts(is_show=False), ),
               visualmap_opts=opts.VisualMapOpts(is_show=True, type_='color', min_=100, max_=1000)
           ))

scatter.render_notebook()
208
+
209
+
210
+ # #### 各行政区均价
211
+ # * 🗣**最贵的是南山区,整体均价9.2W/平米;**
212
+ #
213
+ # * 🗣最便宜的是坪山区,均价3.5W/平米;
214
+
215
+ # In[53]:
216
+
217
+
218
# Mean unit price per district, divided by 10,000 and rounded to one decimal.
temp = data.groupby(['area'])['unit_price'].mean().reset_index()
data_pair = [
    (district, round(avg_price / 10000, 1))
    for district, avg_price in zip(temp['area'].tolist(), temp['unit_price'].tolist())
]

# Choropleth of Shenzhen districts coloured by mean unit price.
map_ = Map(init_opts=opts.InitOpts(theme='dark'))
map_.add("二手房均价", data_pair, '深圳', is_roam=False)
map_.set_series_opts(label_opts=opts.LabelOpts(is_show=True))
map_.set_global_opts(
    title_opts=opts.TitleOpts(title="深圳各行政区二手房均价"),
    legend_opts=opts.LegendOpts(is_show=False),
    tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
    visualmap_opts=opts.VisualMapOpts(min_=3, max_=10),
)

# Written to a standalone HTML file instead of being rendered inline.
map_.render('map.html')
235
+
236
+
237
+ # #### 均价最高的10个小区
238
+ #
239
+ # * **🚫该小区内在售房源至少3套才统计**
240
+
241
+ # In[54]:
242
+
243
+
244
# Mean unit price and number of priced listings per community.
temp = data.groupby(['community'])['unit_price'].agg(['mean', 'count']).reset_index()

# Only communities with at least 3 listings are ranked. They are filtered
# out up front rather than replaced by (0, 0) placeholders, which would
# otherwise show up as bogus bars whenever fewer than 10 communities qualify.
qualified = temp[temp['count'] >= 3]
data_pair = sorted(
    ((name, round(avg_price / 10000, 1))
     for name, avg_price in zip(qualified['community'].tolist(), qualified['mean'].tolist())),
    key=lambda x: x[1], reverse=True)[:10]

# Horizontal bar chart; the pairs are reversed before being added to the
# chart, which is then flipped with reversal_axis().
bar = (Bar(init_opts=opts.InitOpts(theme='dark'))
       .add_xaxis([x[0] for x in data_pair[::-1]])
       .add_yaxis('二手房均价', [x[1] for x in data_pair[::-1]])
       .set_series_opts(label_opts=opts.LabelOpts(is_show=True,
                                                  position='insideRight',
                                                  font_style='italic'),
                        itemstyle_opts=opts.ItemStyleOpts(
                            color=JsCode("""new echarts.graphic.LinearGradient(1, 0, 0, 0,
                                         [{
                                             offset: 0,
                                             color: 'rgb(0,206,209)'
                                         }, {
                                             offset: 1,
                                             color: 'rgb(218,165,32)'
                                         }])"""))
                        )
       .set_global_opts(
           title_opts=opts.TitleOpts(title="深圳二手房均价TOP 10小区"),
           legend_opts=opts.LegendOpts(is_show=False),
           tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'),
           # NOTE(review): hard-coded axis minimum; presumably tuned to the
           # original data set -- confirm it still suits the current data.
           xaxis_opts=opts.AxisOpts(min_=14),
       )
       .reversal_axis()
       )

bar.render_notebook()
276
+
277
+
278
+ # #### 均价最高的10个地段
279
+
280
+ # In[55]:
281
+
282
+
283
# Mean unit price per position. Groups whose mean is NaN (no priced
# listing) are dropped first: NaN compares false against everything, so
# leaving it in would make the descending sort below unreliable.
temp = data.groupby(['position'])['unit_price'].mean().dropna().reset_index()
data_pair = sorted(
    ((name, round(avg_price / 10000, 1))
     for name, avg_price in zip(temp['position'].tolist(), temp['unit_price'].tolist())),
    key=lambda x: x[1], reverse=True)[:10]

# Vertical bar chart of the ten positions with the highest mean unit price.
bar = (Bar(init_opts=opts.InitOpts(theme='dark'))
       .add_xaxis([x[0] for x in data_pair])
       .add_yaxis('二手房均价', [x[1] for x in data_pair])
       .set_series_opts(label_opts=opts.LabelOpts(is_show=True, font_style='italic'),
                        itemstyle_opts=opts.ItemStyleOpts(
                            color=JsCode("""new echarts.graphic.LinearGradient(0, 1, 0, 0,
                                         [{
                                             offset: 0,
                                             color: 'rgb(0,206,209)'
                                         }, {
                                             offset: 1,
                                             color: 'rgb(218,165,32)'
                                         }])"""))
                        )
       .set_global_opts(
           title_opts=opts.TitleOpts(title="深圳二手房均价TOP 10地段"),
           legend_opts=opts.LegendOpts(is_show=False),
           tooltip_opts=opts.TooltipOpts(formatter='{b}:{c}万元'))
       )

bar.render_notebook()
308
+
309
+
310
+ # #### 户型分布
311
+ #
312
+ # * 🚁三室依然是主力;
313
+ #
314
+ # * 🚗在深圳这种寸土寸金的地方,小户型占比也很多;
315
+
316
+ # In[56]:
317
+
318
+
319
# Number of listings per layout type; the ten most frequent are kept.
temp = data.groupby(['hourseType'])['area'].count().reset_index()
layout_counts = list(zip(temp['hourseType'].tolist(), temp['area'].tolist()))
layout_counts.sort(key=lambda x: x[1], reverse=True)
data_pair = layout_counts[:10]

# Rose-type ring chart of the layout distribution.
pie = Pie(init_opts=opts.InitOpts(theme='dark'))
pie.add('', data_pair,
        radius=["30%", "75%"],
        rosetype="radius")
pie.set_global_opts(title_opts=opts.TitleOpts(title="深圳二手房 户型分布"),
                    legend_opts=opts.LegendOpts(is_show=False), )
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))

pie.render_notebook()
333
+
334
+
335
+ # #### 词云图
336
+ #
337
+ # * 来看看房源标题中出现最多的词语
338
+
339
+ # In[57]:
340
+
341
+
342
# Words excluded from the word cloud.
stop_words = ['花园','业主','出售']
# Concatenate every title that is a string; non-string entries are skipped.
string = ''.join(i for i in data['title'] if isinstance(i, str))

# Segment the text with part-of-speech tags, then keep a token unless it
# is a single character, is tagged 'm' or 'x', or is a stop word.
words = psg.cut(string)
word_list = [
    x.word
    for x in words
    if len(x.word) != 1
    and x.flag not in ('m', 'x')
    and x.word not in stop_words
]
356
+
357
+
358
+ # In[58]:
359
+
360
+
361
# The 100 most frequent words with their counts.
word_counts = collections.Counter(word_list)
data_pair = word_counts.most_common(100)

# Triangle-shaped word cloud of the title vocabulary.
wc = WordCloud()
wc.add("", data_pair, word_size_range=[20, 100], shape='triangle')
wc.set_global_opts(title_opts=opts.TitleOpts(title="房源描述词云图"))

wc.render_notebook()
370
+
371
+
372
+ # In[ ]:
373
+
374
+
375
+
376
+