arcticaurora committed e38e721 (parent: 0aa99c7): Create app.py
app.py ADDED
@@ -0,0 +1,121 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import json

app = FastAPI()

class ArticleScraper:
    def __init__(self):
        self.scraper_api_key = "24610cfe7680c5a15d77bd32cfd23fc3"
        self.scraper_api_url = "http://api.scraperapi.com/"

    def scrape_bloomberg(self, article_url):
        params = {
            "api_key": self.scraper_api_key,
            "url": article_url,
        }
        response = requests.get(self.scraper_api_url, params=params)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        script = soup.find('script', {'id': '__NEXT_DATA__'})
        json_data = json.loads(script.text)
        props = json_data['props']['pageProps']
        contents = props['story']['body']['content']

        article_text = []
        for item in contents:
            text = self.extract_text(item)
            if text:
                article_text.append(text)

        return '\n\n'.join(article_text)

    def scrape_financial_times(self, article_url):
        headers = {
            'Referer': 'https://twitter.com'
        }
        cookies = {
            'FTCookieConsentGDPR': 'true',
            'FTAllocation': '00000000-0000-0000-0000-000000000000'
        }
        response = requests.get(article_url, headers=headers, cookies=cookies)

        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            article_script = soup.find('script', {'type': 'application/ld+json'})

            if article_script:
                article_data = json.loads(article_script.string)
                return article_data.get('articleBody', '')
            else:
                return "Article content not found in the expected format."
        else:
            return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    def extract_text(self, content_item):
        if content_item['type'] == 'paragraph':
            text_parts = []
            for item in content_item['content']:
                if item['type'] == 'text':
                    text_parts.append(item['value'])
                elif item['type'] == 'entity':
                    if 'link' in item['data'] and item['data']['link']['destination'].get('web'):
                        url = item['data']['link']['destination']['web']
                        text = ' '.join([sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text'])
                        text_parts.append(f"[{text}]({url})")
                    else:
                        text_parts.extend([sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text'])
                elif item['type'] == 'link':
                    url = item['data']['destination'].get('web', '')
                    text = ' '.join([sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text'])
                    if url:
                        text_parts.append(f"[{text}]({url})")
                    else:
                        text_parts.append(text)
            return ' '.join(text_parts)
        elif content_item['type'] == 'entity' and content_item['subType'] == 'story':
            url = content_item['data']['link']['destination'].get('web', '')
            text = ' '.join([sub_item['value'] for sub_item in content_item['content'] if sub_item['type'] == 'text'])
            return f"Read More: [{text}]({url})"
        elif content_item['type'] == 'media' and content_item['subType'] == 'photo':
            photo_data = content_item['data']['photo']
            caption = photo_data.get('caption', '')
            credit = photo_data.get('credit', '')
            src = photo_data.get('src', '')
            alt = photo_data.get('alt', '')
            return f"\n![{alt}]({src})\n*{caption}* {credit}\n"
        elif content_item['type'] == 'media' and content_item['subType'] == 'chart':
            chart_data = content_item['data']['chart']
            attachment = content_item['data']['attachment']
            title = attachment.get('title', '')
            subtitle = attachment.get('subtitle', '')
            source = attachment.get('source', '')
            fallback_image = chart_data.get('fallback', '')
            footnote = attachment.get('footnote', '')
            return f"\n![{title}]({fallback_image})\n**{title}**\n*{subtitle}*\n{footnote}\n{source}\n"
        return ''

    def scrape_article(self, url):
        if 'bloomberg.com' in url:
            return self.scrape_bloomberg(url)
        elif 'ft.com' in url:
            return self.scrape_financial_times(url)
        else:
            return "Unsupported website. Please provide a URL from Bloomberg or Financial Times."

class ArticleRequest(BaseModel):
    url: str

@app.post("/scrape_article/")
async def scrape_article(request: ArticleRequest):
    scraper = ArticleScraper()
    content = scraper.scrape_article(request.url)
    if "Unsupported website" in content:
        raise HTTPException(status_code=400, detail=content)
    return {"content": content}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
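
A minimal client sketch for the endpoint above (assumptions for illustration: the app is running locally on port 7860, matching the uvicorn.run call, and the article URL is a placeholder, not a real article):

# Hypothetical usage of the /scrape_article/ endpoint defined in app.py.
import requests

resp = requests.post(
    "http://localhost:7860/scrape_article/",
    json={"url": "https://www.ft.com/content/<article-id>"},  # placeholder URL
)
resp.raise_for_status()  # URLs outside bloomberg.com / ft.com return HTTP 400
print(resp.json()["content"])

For unsupported domains the endpoint raises a 400 whose detail field carries the "Unsupported website" message returned by scrape_article.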