fyenne committed on
Commit
9a0691a
1 Parent(s): 2650203
Files changed (5)
  1. .gitignore +2 -0
  2. app.py +2 -1
  3. dataup/world_countries.json +0 -0
  4. scrapy_lianjia1.py +82 -0
  5. test.ipynb +66 -0
.gitignore CHANGED
@@ -6,3 +6,5 @@
  *.gif
  Untitled-1.md
  Untitled-2.sqlbook
+ /ASRT_v1.3.0/
+ /ASRT_v1.3.0/assets/
app.py CHANGED
@@ -20,7 +20,7 @@ warnings.filterwarnings('ignore')
  st.set_page_config(
      page_title="simingyanwebpage",
      page_icon="🧊",
-     layout="wide",
+     layout="centered",
      initial_sidebar_state="expanded",
      menu_items={
          'About': "fyenne@hotmail.com"
@@ -31,6 +31,7 @@ st.set_page_config(
  # main page #
  # ============================================================================ #

+ @st.cache # 👈 Added this
  class load_data:
      '''
      read some data
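Note: st.cache is normally applied to a data-loading function rather than a class, so the decorator above may not cache as intended. A minimal sketch of the usual pattern, assuming the data lives in a CSV file (the function name read_some_data and the path are hypothetical):

import streamlit as st
import pandas as pd

@st.cache  # memoize the expensive read across Streamlit reruns
def read_some_data(path: str) -> pd.DataFrame:
    # hypothetical stand-in for the load_data class in app.py
    return pd.read_csv(path)

df = read_some_data("dataup/some_table.csv")  # hypothetical path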
dataup/world_countries.json ADDED
The diff for this file is too large to render. See raw diff
 
scrapy_lianjia1.py ADDED
@@ -0,0 +1,82 @@
+ # -*- coding: utf-8 -*-
+ import scrapy
+ import csv
+ import os
+ import re
+
+
+ class LianjiaCrawlSpider(scrapy.Spider):
+     """Crawl Shanghai rental listings from lianjia.com and write them to lianjia.csv."""
+     name = 'lianjia_crawl'
+     allowed_domains = ['sh.lianjia.com']  # domains only, no URL path
+     start_urls = ['https://sh.lianjia.com/zufang/']
+     # custom_settings = {
+     #     'CLOSESPIDER_ERRORCOUNT': 1
+     # }
+
+     def __init__(self):
+         self.file = open('lianjia.csv', 'w', newline='')
+         self.writer = csv.writer(self.file, dialect="excel")
+         self.writer.writerow(['形式', '小区名', '区', '地点', '面积', '朝向', '户型', '标签', '租金', '经度', '纬度'])
+
+     def start_requests(self):
+         yield scrapy.Request(url='https://sh.lianjia.com/zufang', callback=self.parse_district)
+
+     def parse_district(self, response):
+         # collect the district links from the area filter list
+         li_all = response.css('ul[data-target="area"]')
+         districts = li_all.css('li a::attr(href)').extract()[1:]
+         for district in districts:
+             url_district = 'https://sh.lianjia.com' + str(district)
+             yield scrapy.Request(url=url_district, callback=self.parse_block, dont_filter=True)
+
+     def parse_block(self, response):
+         # collect the sub-district (block) links within one district
+         li_all = response.css('ul[data-target="area"]')
+         blocks = li_all[1].css('li.filter__item--level3 a::attr(href)').extract()[1:]
+         for block in blocks:
+             url_block = 'https://sh.lianjia.com' + str(block)
+             yield scrapy.Request(url=url_block, callback=self.parse_page, dont_filter=True)
+
+     def parse_page(self, response):
+         # paginate through every result page of a block
+         url_block = response.url
+         try:
+             max_page = int(response.css('div.content__pg::attr(data-totalpage)').extract()[0])
+         except Exception:
+             max_page = 1
+         for page in range(1, max_page + 1):
+             url_page = url_block + 'pg' + str(page) + '/#contentList'
+             yield scrapy.Request(url=url_page, callback=self.parse_dtl, dont_filter=True)
+
+     def parse_dtl(self, response):
+         # parse each listing card on a result page
+         li_all = response.css('div.content__list--item')
+         for i in li_all:
+             try:
+                 item = dict()
+                 item['title'] = i.css('p.content__list--item--title.twoline a::text').extract()[0].strip().split(' ')[0].split('·')
+                 item['loc'] = i.css('p.content__list--item--des a::text').extract()
+                 dtl = i.css('p.content__list--item--des::text').extract()[4:-1]
+                 dtl = [x.strip() for x in dtl]
+                 if len(dtl) == 3:
+                     item['area'], item['orientation'], item['house_type'] = dtl[0], dtl[1], dtl[2]
+                 elif len(dtl) == 2:
+                     item['area'], item['orientation'], item['house_type'] = dtl[0], '', dtl[1]
+                 item['tag'] = '/'.join(i.css('p.content__list--item--bottom.oneline i::text').extract())
+                 item['price'] = i.css('span.content__list--item-price em::text').extract()[0]
+                 url_dtl = 'https://sh.lianjia.com' + i.css('p.content__list--item--title.twoline a::attr(href)').extract()[0]
+                 yield scrapy.Request(url=url_dtl, meta={'item': item}, callback=self.parse_ll, dont_filter=True)
+             except Exception:
+                 pass
+
+     def parse_ll(self, response):
+         # pull longitude/latitude from the detail page and write one CSV row
+         item = response.meta['item']
+         html = response.text
+         longitude = re.findall(r'longitude(.+)', html)[0]
+         item['longitude'] = re.findall(r'(\d+\.\d+)', longitude)[0].strip()
+         latitude = re.findall(r'latitude(.+)', html)[0]
+         item['latitude'] = re.findall(r'(\d+\.\d+)', latitude)[0].strip()
+
+         self.writer.writerow(
+             [item['title'][0], item['title'][1], item['loc'][0], item['loc'][1], item['area'], item['orientation'], item['house_type'], item['tag'], item['price'], item['longitude'], item['latitude']]
+         )
+
+         self.file.flush()
+         os.fsync(self.file.fileno())
+
+         print("over: " + response.url)
test.ipynb CHANGED
@@ -315,6 +315,72 @@
      " index=range(len(message))).head(20)"
     ]
    },
+   {
+    "cell_type": "code",
+    "execution_count": 51,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import requests \n",
+     "from bs4 import BeautifulSoup\n",
+     "# fetch the page to train\n",
+     "einstein_url = 'http://quotes.toscrape.com/author/Albert-Einstein/'\n",
+     "resp = requests.get(einstein_url, timeout = 5)\n",
+     "# resp\n",
+     "assert resp.status_code == 200"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 54,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "soup = BeautifulSoup(resp.content)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 125,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "<p>\n",
+        "<a href=\"/login\">Login</a>\n",
+        "</p>"
+       ]
+      },
+      "execution_count": 125,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "soup.find()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 126,
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/plain": [
+        "{'href': '/', 'style': 'text-decoration: none'}"
+       ]
+      },
+      "execution_count": 126,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "soup.a.attrs"
+    ]
+   },
    {
     "cell_type": "code",
     "execution_count": null,