arcticaurora committed
Commit e38e721
1 Parent(s): 0aa99c7

Create app.py

Files changed (1):
  app.py +121 -0
app.py ADDED
@@ -0,0 +1,121 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ import requests
+ from bs4 import BeautifulSoup
+ import json
+
+ app = FastAPI()
+
+ class ArticleScraper:
+     def __init__(self):
+         self.scraper_api_key = "24610cfe7680c5a15d77bd32cfd23fc3"
+         self.scraper_api_url = "http://api.scraperapi.com/"
+
+     def scrape_bloomberg(self, article_url):
+         # Fetch the page through ScraperAPI, which proxies the request
+         # past Bloomberg's bot detection.
+         params = {
+             "api_key": self.scraper_api_key,
+             "url": article_url,
+         }
+         response = requests.get(self.scraper_api_url, params=params)
+         response.raise_for_status()
+         soup = BeautifulSoup(response.text, 'html.parser')
+         # Bloomberg embeds the full article as JSON in the Next.js data blob.
+         script = soup.find('script', {'id': '__NEXT_DATA__'})
+         if script is None:
+             return "Article content not found in the expected format."
+         json_data = json.loads(script.text)
+         props = json_data['props']['pageProps']
+         contents = props['story']['body']['content']
+
+         article_text = []
+         for item in contents:
+             text = self.extract_text(item)
+             if text:
+                 article_text.append(text)
+
+         return '\n\n'.join(article_text)
+
+     def scrape_financial_times(self, article_url):
+         # FT serves the full article to requests arriving via a Twitter referer.
+         headers = {
+             'Referer': 'https://twitter.com'
+         }
+         cookies = {
+             'FTCookieConsentGDPR': 'true',
+             'FTAllocation': '00000000-0000-0000-0000-000000000000'
+         }
+         response = requests.get(article_url, headers=headers, cookies=cookies)
+
+         if response.status_code == 200:
+             soup = BeautifulSoup(response.content, 'html.parser')
+             # The article body is mirrored in the ld+json structured-data block.
+             article_script = soup.find('script', {'type': 'application/ld+json'})
+
+             if article_script:
+                 article_data = json.loads(article_script.string)
+                 return article_data.get('articleBody', '')
+             else:
+                 return "Article content not found in the expected format."
+         else:
+             return f"Failed to retrieve the webpage. Status code: {response.status_code}"
+
+     def extract_text(self, content_item):
+         # Render one node of Bloomberg's content tree as Markdown-flavored text.
+         if content_item['type'] == 'paragraph':
+             text_parts = []
+             for item in content_item['content']:
+                 if item['type'] == 'text':
+                     text_parts.append(item['value'])
+                 elif item['type'] == 'entity':
+                     link = item.get('data', {}).get('link', {})
+                     if link.get('destination', {}).get('web'):
+                         url = link['destination']['web']
+                         text = ' '.join(sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text')
+                         text_parts.append(f"[{text}]({url})")
+                     else:
+                         text_parts.extend(sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text')
+                 elif item['type'] == 'link':
+                     url = item['data']['destination'].get('web', '')
+                     text = ' '.join(sub_item['value'] for sub_item in item['content'] if sub_item['type'] == 'text')
+                     if url:
+                         text_parts.append(f"[{text}]({url})")
+                     else:
+                         text_parts.append(text)
+             return ' '.join(text_parts)
+         elif content_item['type'] == 'entity' and content_item.get('subType') == 'story':
+             # Cross-link to a related story.
+             url = content_item['data']['link']['destination'].get('web', '')
+             text = ' '.join(sub_item['value'] for sub_item in content_item['content'] if sub_item['type'] == 'text')
+             return f"Read More: [{text}]({url})"
+         elif content_item['type'] == 'media' and content_item.get('subType') == 'photo':
+             photo_data = content_item['data']['photo']
+             caption = photo_data.get('caption', '')
+             credit = photo_data.get('credit', '')
+             src = photo_data.get('src', '')
+             alt = photo_data.get('alt', '')
+             return f"\n![{alt}]({src})\n*{caption}* {credit}\n"
+         elif content_item['type'] == 'media' and content_item.get('subType') == 'chart':
+             chart_data = content_item['data']['chart']
+             attachment = content_item['data']['attachment']
+             title = attachment.get('title', '')
+             subtitle = attachment.get('subtitle', '')
+             source = attachment.get('source', '')
+             fallback_image = chart_data.get('fallback', '')
+             footnote = attachment.get('footnote', '')
+             return f"\n![{title}]({fallback_image})\n**{title}**\n*{subtitle}*\n{footnote}\n{source}\n"
+         return ''
+
+     def scrape_article(self, url):
+         # Route the URL to the matching site-specific scraper.
+         if 'bloomberg.com' in url:
+             return self.scrape_bloomberg(url)
+         elif 'ft.com' in url:
+             return self.scrape_financial_times(url)
+         else:
+             return "Unsupported website. Please provide a URL from Bloomberg or Financial Times."
+
+ class ArticleRequest(BaseModel):
+     url: str
+
+ @app.post("/scrape_article/")
+ async def scrape_article(request: ArticleRequest):
+     scraper = ArticleScraper()
+     content = scraper.scrape_article(request.url)
+     if "Unsupported website" in content:
+         raise HTTPException(status_code=400, detail=content)
+     return {"content": content}
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=7860)
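
For a quick check of the endpoint, a client call might look like the minimal sketch below. It assumes the app is running locally on port 7860 (as in the `__main__` block above); the article URL is a hypothetical placeholder, not a real story.

    import requests

    # Placeholder URL for illustration only; substitute a real Bloomberg or FT link.
    payload = {"url": "https://www.bloomberg.com/news/articles/example-article"}

    # POST the JSON body that the ArticleRequest model expects.
    resp = requests.post("http://localhost:7860/scrape_article/", json=payload)
    resp.raise_for_status()
    print(resp.json()["content"])

A URL from an unsupported site should come back as a 400 with the "Unsupported website" detail, per the HTTPException branch in the handler.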