commit
Browse files- .gitignore +2 -0
- app.py +2 -1
- dataup/world_countries.json +0 -0
- scrapy_lianjia1.py +82 -0
- test.ipynb +66 -0
.gitignore
CHANGED
@@ -6,3 +6,5 @@
|
|
6 |
*.gif
|
7 |
Untitled-1.md
|
8 |
Untitled-2.sqlbook
|
|
|
|
|
|
6 |
*.gif
|
7 |
Untitled-1.md
|
8 |
Untitled-2.sqlbook
|
9 |
+
/ASRT_v1.3.0/
|
10 |
+
/ASRT_v1.3.0/assets/
|
app.py
CHANGED
@@ -20,7 +20,7 @@ warnings.filterwarnings('ignore')
|
|
20 |
st.set_page_config(
|
21 |
page_title="simingyanwebpage",
|
22 |
page_icon="🧊",
|
23 |
-
layout="
|
24 |
initial_sidebar_state="expanded",
|
25 |
menu_items={
|
26 |
'About': "fyenne@hotmail.com"
|
@@ -31,6 +31,7 @@ st.set_page_config(
|
|
31 |
# main page #
|
32 |
# ============================================================================ #
|
33 |
|
|
|
34 |
class load_data:
|
35 |
'''
|
36 |
read some data
|
|
|
20 |
st.set_page_config(
|
21 |
page_title="simingyanwebpage",
|
22 |
page_icon="🧊",
|
23 |
+
layout="centered",
|
24 |
initial_sidebar_state="expanded",
|
25 |
menu_items={
|
26 |
'About': "fyenne@hotmail.com"
|
|
|
31 |
# main page #
|
32 |
# ============================================================================ #
|
33 |
|
34 |
+
@st.cache # 👈 Added this
|
35 |
class load_data:
|
36 |
'''
|
37 |
read some data
|
dataup/world_countries.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
scrapy_lianjia1.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import scrapy
|
3 |
+
import csv
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
|
7 |
+
class LianjiaCrawlSpider(scrapy.Spider):
    """Scrape Shanghai long-term rental listings from sh.lianjia.com.

    Crawl flow: district filter links -> sub-district (block) links ->
    every result page of a block -> each listing's detail page (for the
    longitude/latitude embedded in its inline JavaScript).  One CSV row
    is written per listing to ``lianjia.csv``.
    """

    name = 'lianjia_crawl'
    # allowed_domains entries must be bare domains; the original value
    # 'sh.lianjia.com/zufang' (domain + path) makes OffsiteMiddleware
    # filter out every request the spider yields.
    allowed_domains = ['sh.lianjia.com']
    start_urls = ['https://sh.lianjia.com/zufang/']
    # custom_settings={
    #     'CLOSESPIDER_ERRORCOUNT': 1
    # }

    # Pre-compiled patterns, hoisted out of parse_ll.  Raw strings avoid
    # invalid-escape warnings and the decimal point is escaped — the
    # original '(\d+.\d+)' let '.' match any character.
    _COORD_NUM = re.compile(r'(\d+\.\d+)')
    _LONGITUDE = re.compile(r'longitude(.+)')
    _LATITUDE = re.compile(r'latitude(.+)')

    def __init__(self):
        super().__init__()
        # Explicit UTF-8 so the Chinese header row round-trips regardless
        # of the platform's default locale encoding.
        self.file = open('lianjia.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file, dialect="excel")
        self.writer.writerow(['形式','小区名','区','地点','面积','朝向','户型','标签','租金','经度','纬度'])

    def closed(self, reason):
        """Scrapy's spider-finished hook: release the CSV file handle."""
        if not self.file.closed:
            self.file.close()

    def start_requests(self):
        yield scrapy.Request(url='https://sh.lianjia.com/zufang',
                             callback=self.parse_district)

    def parse_district(self, response):
        """Yield one request per district filter link on the landing page."""
        li_all = response.css('ul[data-target="area"]')
        # The first link is the "all districts" entry; skip it.
        districts = li_all.css('li a::attr(href)').extract()[1:]
        for district in districts:
            url_district = 'https://sh.lianjia.com' + str(district)
            yield scrapy.Request(url=url_district,
                                 callback=self.parse_block,
                                 dont_filter=True)

    def parse_block(self, response):
        """Yield one request per sub-district (block) link of a district page."""
        li_all = response.css('ul[data-target="area"]')
        # The second filter list holds the level-3 (block) links; skip "all".
        blocks = li_all[1].css('li.filter__item--level3 a::attr(href)').extract()[1:]
        for block in blocks:
            url_block = 'https://sh.lianjia.com' + str(block)
            yield scrapy.Request(url=url_block,
                                 callback=self.parse_page,
                                 dont_filter=True)

    def parse_page(self, response):
        """Fan out to every result page (pg1..pgN) of the current block."""
        url_block = response.url
        try:
            max_page = int(response.css(
                'div.content__pg::attr(data-totalpage)').extract()[0])
        except (IndexError, ValueError):
            # No pager element (single page of results) or a non-numeric
            # attribute: fall back to one page.  Narrowed from the
            # original bare `except:`.
            max_page = 1
        for page in range(1, max_page + 1):
            url_page = url_block + 'pg' + str(page) + '/#contentList'
            yield scrapy.Request(url=url_page,
                                 callback=self.parse_dtl,
                                 dont_filter=True)

    def parse_dtl(self, response):
        """Extract per-listing fields from a result page and follow the
        detail link so parse_ll can add coordinates and write the row."""
        li_all = response.css('div.content__list--item')
        for card in li_all:
            try:
                item = dict()
                # Title looks like "整租·小区名 ..."; keep the part before the
                # first space, split on '·' into [形式, 小区名].
                item['title'] = card.css(
                    'p.content__list--item--title.twoline a::text'
                ).extract()[0].strip().split(' ')[0].split('·')
                item['loc'] = card.css(
                    'p.content__list--item--des a::text').extract()
                dtl = card.css(
                    'p.content__list--item--des::text').extract()[4:-1]
                dtl = [x.strip() for x in dtl]
                if len(dtl) == 3:
                    item['area'], item['orientation'], item['house_type'] = dtl
                elif len(dtl) == 2:
                    # Orientation missing from the summary line.
                    item['area'], item['orientation'], item['house_type'] = \
                        dtl[0], '', dtl[1]
                else:
                    # Unexpected layout: keep the listing rather than crash
                    # on a KeyError later.
                    item['area'] = item['orientation'] = item['house_type'] = ''
                item['tag'] = '/'.join(card.css(
                    'p.content__list--item--bottom.oneline i::text').extract())
                item['price'] = card.css(
                    'span.content__list--item-price em::text').extract()[0]
                url_dtl = 'https://sh.lianjia.com' + card.css(
                    'p.content__list--item--title.twoline a::attr(href)'
                ).extract()[0]
                yield scrapy.Request(url=url_dtl,
                                     meta={'item': item},
                                     callback=self.parse_ll,
                                     dont_filter=True)
            except IndexError:
                # Ads / malformed cards lack some selectors; log and skip
                # instead of the original silent bare `except: pass`.
                self.logger.debug('skipping malformed listing card on %s',
                                  response.url)

    def parse_ll(self, response):
        """Pull longitude/latitude out of the detail page and write the row."""
        item = response.meta['item']
        html = response.text
        lon_fragment = self._LONGITUDE.findall(html)[0]
        item['longitude'] = self._COORD_NUM.findall(lon_fragment)[0].strip()
        lat_fragment = self._LATITUDE.findall(html)[0]
        item['latitude'] = self._COORD_NUM.findall(lat_fragment)[0].strip()

        self.writer.writerow([
            item['title'][0], item['title'][1],
            item['loc'][0], item['loc'][1],
            item['area'], item['orientation'], item['house_type'],
            item['tag'], item['price'],
            item['longitude'], item['latitude'],
        ])

        self.file.flush()
        # os.fsync() requires an integer file descriptor; the original
        # passed the file object itself, which raises TypeError.
        os.fsync(self.file.fileno())

        print("over: " + response.url)
|
test.ipynb
CHANGED
@@ -315,6 +315,72 @@
|
|
315 |
" index=range(len(message))).head(20)"
|
316 |
]
|
317 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
{
|
319 |
"cell_type": "code",
|
320 |
"execution_count": null,
|
|
|
315 |
" index=range(len(message))).head(20)"
|
316 |
]
|
317 |
},
|
318 |
+
{
|
319 |
+
"cell_type": "code",
|
320 |
+
"execution_count": 51,
|
321 |
+
"metadata": {},
|
322 |
+
"outputs": [],
|
323 |
+
"source": [
|
324 |
+
"import requests \n",
|
325 |
+
"from bs4 import BeautifulSoup\n",
|
326 |
+
"# fetch the page to train\n",
|
327 |
+
"einstein_url = 'http://quotes.toscrape.com/author/Albert-Einstein/'\n",
|
328 |
+
"resp = requests.get(einstein_url, timeout = 5)\n",
|
329 |
+
"# resp\n",
|
330 |
+
"assert resp.status_code == 200"
|
331 |
+
]
|
332 |
+
},
|
333 |
+
{
|
334 |
+
"cell_type": "code",
|
335 |
+
"execution_count": 54,
|
336 |
+
"metadata": {},
|
337 |
+
"outputs": [],
|
338 |
+
"source": [
|
339 |
+
"soup = BeautifulSoup(resp.content)"
|
340 |
+
]
|
341 |
+
},
|
342 |
+
{
|
343 |
+
"cell_type": "code",
|
344 |
+
"execution_count": 125,
|
345 |
+
"metadata": {},
|
346 |
+
"outputs": [
|
347 |
+
{
|
348 |
+
"data": {
|
349 |
+
"text/plain": [
|
350 |
+
"<p>\n",
|
351 |
+
"<a href=\"/login\">Login</a>\n",
|
352 |
+
"</p>"
|
353 |
+
]
|
354 |
+
},
|
355 |
+
"execution_count": 125,
|
356 |
+
"metadata": {},
|
357 |
+
"output_type": "execute_result"
|
358 |
+
}
|
359 |
+
],
|
360 |
+
"source": [
|
361 |
+
"soup.find()"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": 126,
|
367 |
+
"metadata": {},
|
368 |
+
"outputs": [
|
369 |
+
{
|
370 |
+
"data": {
|
371 |
+
"text/plain": [
|
372 |
+
"{'href': '/', 'style': 'text-decoration: none'}"
|
373 |
+
]
|
374 |
+
},
|
375 |
+
"execution_count": 126,
|
376 |
+
"metadata": {},
|
377 |
+
"output_type": "execute_result"
|
378 |
+
}
|
379 |
+
],
|
380 |
+
"source": [
|
381 |
+
"soup.a.attrs"
|
382 |
+
]
|
383 |
+
},
|
384 |
{
|
385 |
"cell_type": "code",
|
386 |
"execution_count": null,
|