commit
Browse files- .gitignore +2 -0
- app.py +2 -1
- dataup/world_countries.json +0 -0
- scrapy_lianjia1.py +82 -0
- test.ipynb +66 -0
.gitignore
CHANGED
@@ -6,3 +6,5 @@
|
|
6 |
*.gif
|
7 |
Untitled-1.md
|
8 |
Untitled-2.sqlbook
|
|
|
|
|
|
6 |
*.gif
|
7 |
Untitled-1.md
|
8 |
Untitled-2.sqlbook
|
9 |
+
/ASRT_v1.3.0/
|
10 |
+
/ASRT_v1.3.0/assets/
|
app.py
CHANGED
@@ -20,7 +20,7 @@ warnings.filterwarnings('ignore')
|
|
20 |
st.set_page_config(
|
21 |
page_title="simingyanwebpage",
|
22 |
page_icon="🧊",
|
23 |
-
layout="
|
24 |
initial_sidebar_state="expanded",
|
25 |
menu_items={
|
26 |
'About': "fyenne@hotmail.com"
|
@@ -31,6 +31,7 @@ st.set_page_config(
|
|
31 |
# main page #
|
32 |
# ============================================================================ #
|
33 |
|
|
|
34 |
class load_data:
|
35 |
'''
|
36 |
read some data
|
|
|
20 |
st.set_page_config(
|
21 |
page_title="simingyanwebpage",
|
22 |
page_icon="🧊",
|
23 |
+
layout="centered",
|
24 |
initial_sidebar_state="expanded",
|
25 |
menu_items={
|
26 |
'About': "fyenne@hotmail.com"
|
|
|
31 |
# main page #
|
32 |
# ============================================================================ #
|
33 |
|
34 |
+
@st.cache # 👈 Added this
|
35 |
class load_data:
|
36 |
'''
|
37 |
read some data
|
dataup/world_countries.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
scrapy_lianjia1.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import scrapy
|
3 |
+
import csv
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
|
7 |
+
class LianjiaCrawlSpider(scrapy.Spider):
    """Scrape Shanghai long-term rental listings from sh.lianjia.com.

    Crawl flow: district filter links -> sub-district (block) links ->
    every result page of a block -> each listing's detail page (for the
    longitude/latitude embedded in its inline JavaScript).  One CSV row
    is written per listing to ``lianjia.csv``.
    """

    name = 'lianjia_crawl'
    # allowed_domains entries must be bare domains; the original value
    # 'sh.lianjia.com/zufang' (domain + path) makes OffsiteMiddleware
    # filter out every request the spider yields.
    allowed_domains = ['sh.lianjia.com']
    start_urls = ['https://sh.lianjia.com/zufang/']
    # custom_settings={
    #     'CLOSESPIDER_ERRORCOUNT': 1
    # }

    # Pre-compiled patterns, hoisted out of parse_ll.  Raw strings avoid
    # invalid-escape warnings and the decimal point is escaped — the
    # original '(\d+.\d+)' let '.' match any character.
    _COORD_NUM = re.compile(r'(\d+\.\d+)')
    _LONGITUDE = re.compile(r'longitude(.+)')
    _LATITUDE = re.compile(r'latitude(.+)')

    def __init__(self):
        super().__init__()
        # Explicit UTF-8 so the Chinese header row round-trips regardless
        # of the platform's default locale encoding.
        self.file = open('lianjia.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file, dialect="excel")
        self.writer.writerow(['形式','小区名','区','地点','面积','朝向','户型','标签','租金','经度','纬度'])

    def closed(self, reason):
        """Scrapy's spider-finished hook: release the CSV file handle."""
        if not self.file.closed:
            self.file.close()

    def start_requests(self):
        yield scrapy.Request(url='https://sh.lianjia.com/zufang',
                             callback=self.parse_district)

    def parse_district(self, response):
        """Yield one request per district filter link on the landing page."""
        li_all = response.css('ul[data-target="area"]')
        # The first link is the "all districts" entry; skip it.
        districts = li_all.css('li a::attr(href)').extract()[1:]
        for district in districts:
            url_district = 'https://sh.lianjia.com' + str(district)
            yield scrapy.Request(url=url_district,
                                 callback=self.parse_block,
                                 dont_filter=True)

    def parse_block(self, response):
        """Yield one request per sub-district (block) link of a district page."""
        li_all = response.css('ul[data-target="area"]')
        # The second filter list holds the level-3 (block) links; skip "all".
        blocks = li_all[1].css('li.filter__item--level3 a::attr(href)').extract()[1:]
        for block in blocks:
            url_block = 'https://sh.lianjia.com' + str(block)
            yield scrapy.Request(url=url_block,
                                 callback=self.parse_page,
                                 dont_filter=True)

    def parse_page(self, response):
        """Fan out to every result page (pg1..pgN) of the current block."""
        url_block = response.url
        try:
            max_page = int(response.css(
                'div.content__pg::attr(data-totalpage)').extract()[0])
        except (IndexError, ValueError):
            # No pager element (single page of results) or a non-numeric
            # attribute: fall back to one page.  Narrowed from the
            # original bare `except:`.
            max_page = 1
        for page in range(1, max_page + 1):
            url_page = url_block + 'pg' + str(page) + '/#contentList'
            yield scrapy.Request(url=url_page,
                                 callback=self.parse_dtl,
                                 dont_filter=True)

    def parse_dtl(self, response):
        """Extract per-listing fields from a result page and follow the
        detail link so parse_ll can add coordinates and write the row."""
        li_all = response.css('div.content__list--item')
        for card in li_all:
            try:
                item = dict()
                # Title looks like "整租·小区名 ..."; keep the part before the
                # first space, split on '·' into [形式, 小区名].
                item['title'] = card.css(
                    'p.content__list--item--title.twoline a::text'
                ).extract()[0].strip().split(' ')[0].split('·')
                item['loc'] = card.css(
                    'p.content__list--item--des a::text').extract()
                dtl = card.css(
                    'p.content__list--item--des::text').extract()[4:-1]
                dtl = [x.strip() for x in dtl]
                if len(dtl) == 3:
                    item['area'], item['orientation'], item['house_type'] = dtl
                elif len(dtl) == 2:
                    # Orientation missing from the summary line.
                    item['area'], item['orientation'], item['house_type'] = \
                        dtl[0], '', dtl[1]
                else:
                    # Unexpected layout: keep the listing rather than crash
                    # on a KeyError later.
                    item['area'] = item['orientation'] = item['house_type'] = ''
                item['tag'] = '/'.join(card.css(
                    'p.content__list--item--bottom.oneline i::text').extract())
                item['price'] = card.css(
                    'span.content__list--item-price em::text').extract()[0]
                url_dtl = 'https://sh.lianjia.com' + card.css(
                    'p.content__list--item--title.twoline a::attr(href)'
                ).extract()[0]
                yield scrapy.Request(url=url_dtl,
                                     meta={'item': item},
                                     callback=self.parse_ll,
                                     dont_filter=True)
            except IndexError:
                # Ads / malformed cards lack some selectors; log and skip
                # instead of the original silent bare `except: pass`.
                self.logger.debug('skipping malformed listing card on %s',
                                  response.url)

    def parse_ll(self, response):
        """Pull longitude/latitude out of the detail page and write the row."""
        item = response.meta['item']
        html = response.text
        lon_fragment = self._LONGITUDE.findall(html)[0]
        item['longitude'] = self._COORD_NUM.findall(lon_fragment)[0].strip()
        lat_fragment = self._LATITUDE.findall(html)[0]
        item['latitude'] = self._COORD_NUM.findall(lat_fragment)[0].strip()

        self.writer.writerow([
            item['title'][0], item['title'][1],
            item['loc'][0], item['loc'][1],
            item['area'], item['orientation'], item['house_type'],
            item['tag'], item['price'],
            item['longitude'], item['latitude'],
        ])

        self.file.flush()
        # os.fsync() requires an integer file descriptor; the original
        # passed the file object itself, which raises TypeError.
        os.fsync(self.file.fileno())

        print("over: " + response.url)
|
test.ipynb
CHANGED
@@ -315,6 +315,72 @@
|
|
315 |
" index=range(len(message))).head(20)"
|
316 |
]
|
317 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
{
|
319 |
"cell_type": "code",
|
320 |
"execution_count": null,
|
|
|
315 |
" index=range(len(message))).head(20)"
|
316 |
]
|
317 |
},
|
318 |
+
{
|
319 |
+
"cell_type": "code",
|
320 |
+
"execution_count": 51,
|
321 |
+
"metadata": {},
|
322 |
+
"outputs": [],
|
323 |
+
"source": [
|
324 |
+
"import requests \n",
|
325 |
+
"from bs4 import BeautifulSoup\n",
|
326 |
+
"# fetch the page to train\n",
|
327 |
+
"einstein_url = 'http://quotes.toscrape.com/author/Albert-Einstein/'\n",
|
328 |
+
"resp = requests.get(einstein_url, timeout = 5)\n",
|
329 |
+
"# resp\n",
|
330 |
+
"assert resp.status_code == 200"
|
331 |
+
]
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"cell_type": "code",
|
335 |
+
"execution_count": 54,
|
336 |
+
"metadata": {},
|
337 |
+
"outputs": [],
|
338 |
+
"source": [
|
339 |
+
"soup = BeautifulSoup(resp.content)"
|
340 |
+
]
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"cell_type": "code",
|
344 |
+
"execution_count": 125,
|
345 |
+
"metadata": {},
|
346 |
+
"outputs": [
|
347 |
+
{
|
348 |
+
"data": {
|
349 |
+
"text/plain": [
|
350 |
+
"<p>\n",
|
351 |
+
"<a href=\"/login\">Login</a>\n",
|
352 |
+
"</p>"
|
353 |
+
]
|
354 |
+
},
|
355 |
+
"execution_count": 125,
|
356 |
+
"metadata": {},
|
357 |
+
"output_type": "execute_result"
|
358 |
+
}
|
359 |
+
],
|
360 |
+
"source": [
|
361 |
+
"soup.find()"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": 126,
|
367 |
+
"metadata": {},
|
368 |
+
"outputs": [
|
369 |
+
{
|
370 |
+
"data": {
|
371 |
+
"text/plain": [
|
372 |
+
"{'href': '/', 'style': 'text-decoration: none'}"
|
373 |
+
]
|
374 |
+
},
|
375 |
+
"execution_count": 126,
|
376 |
+
"metadata": {},
|
377 |
+
"output_type": "execute_result"
|
378 |
+
}
|
379 |
+
],
|
380 |
+
"source": [
|
381 |
+
"soup.a.attrs"
|
382 |
+
]
|
383 |
+
},
|
384 |
{
|
385 |
"cell_type": "code",
|
386 |
"execution_count": null,
|