Arafath10 committed on
Commit
920f2ce
·
verified ·
1 Parent(s): 838f637

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +47 -13
main.py CHANGED
@@ -7,12 +7,6 @@ from pydantic import BaseModel
7
  from io import StringIO
8
  import os
9
 
10
- from llmlingua import PromptCompressor
11
- llm_lingua = PromptCompressor(
12
- model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
13
- use_llmlingua2=True, # Whether to use llmlingua-2
14
- device_map="cpu"
15
- )
16
 
17
 
18
  app = FastAPI()
@@ -24,11 +18,51 @@ app.add_middleware(
24
  allow_headers=["*"],
25
  )
26
 
27
- class Prompt(BaseModel):
28
- original_prompt: str
29
- @app.post("/get_compressed_text")
30
- async def get_compressed_text(prompt: Prompt):
31
- compressed_prompt = llm_lingua.compress_prompt(prompt.original_prompt, instruction="", question="")
32
- print("compressed")
33
- return compressed_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
7
  from io import StringIO
8
  import os
9
 
 
 
 
 
 
 
10
 
11
 
12
  app = FastAPI()
 
18
  allow_headers=["*"],
19
  )
20
 
21
+ import nest_asyncio
22
+ import asyncio
23
+ from playwright.async_api import async_playwright
24
+
25
+ # Apply nest_asyncio to allow nested asyncio.run() calls
26
+ nest_asyncio.apply()
27
+
28
# Page that was hard-coded in the first version of the scraper; kept as the
# default so calling scrape_links() with no arguments behaves as before.
DEFAULT_SCRAPE_URL = (
    'https://www.fool.com/earnings/call-transcripts/2024/01/24/'
    'tesla-tsla-q4-2023-earnings-call-transcript/'
)

# Only these Playwright resource types are fetched; everything else
# (images, stylesheets, fonts, media, ...) is aborted to speed up loading.
_ALLOWED_RESOURCE_TYPES = frozenset({"document", "script"})


async def scrape_links(url=DEFAULT_SCRAPE_URL, settle_ms=10):
    """Collect every link target and every non-empty text node of a page.

    Args:
        url: Absolute URL of the page to load. Defaults to the page the
            original implementation was hard-wired to.
        settle_ms: Milliseconds to wait after ``domcontentloaded`` before
            reading the DOM. Defaults to the original value of 10.

    Returns:
        A list of dicts: first one ``{'href': <value or None>}`` per ``<a>``
        element (in document order), then one ``{'text': <stripped text>}``
        per descendant of ``<body>`` whose text content is not blank.
    """

    async def _allow_only_needed(route):
        # Awaiting explicitly (instead of returning the coroutine from a
        # lambda) guarantees every intercepted request is resolved.
        if route.request.resource_type in _ALLOWED_RESOURCE_TYPES:
            await route.continue_()
        else:
            await route.abort()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.route("**/*", _allow_only_needed)
            await page.goto(url, wait_until='domcontentloaded')
            await page.wait_for_timeout(settle_ms)

            # One round trip per selector instead of one per element:
            # 'body *' can match thousands of nodes on a real page.
            hrefs = await page.eval_on_selector_all(
                'a', "els => els.map(el => el.getAttribute('href'))"
            )
            texts = await page.eval_on_selector_all(
                'body *', "els => els.map(el => el.textContent)"
            )
        finally:
            # Close the browser even when navigation or extraction fails.
            await browser.close()

    result = [{'href': href} for href in hrefs]
    for text in texts:
        # Strip in Python so whitespace handling matches the original code.
        stripped = text.strip() if text else ''
        if stripped:
            result.append({'text': stripped})
    return result


@app.post("/get_webscrapet_data")
async def get_webscrapet_data(url):
    """Scrape ``url`` and return its links and text fragments.

    Args:
        url: Absolute http(s) URL of the page to scrape (query parameter).

    Returns:
        The list produced by ``scrape_links(url)``.

    Raises:
        HTTPException: 400 when ``url`` is not an absolute http(s) URL.
    """
    from urllib.parse import urlsplit

    from fastapi import HTTPException

    try:
        parts = urlsplit(url)
    except ValueError:
        parts = None
    if parts is None or parts.scheme not in ("http", "https") or not parts.netloc:
        raise HTTPException(
            status_code=400, detail="url must be an absolute http(s) URL"
        )
    # NOTE(review): the scheme check does not stop requests to internal or
    # private hosts; add an allow-list if this service is publicly reachable.

    # The handler already runs inside the server's event loop, so the
    # coroutine is awaited directly; asyncio.run() here would need a nested
    # loop and would block every other request while scraping.
    results = await scrape_links(url)
    print(f"scraped {len(results)} items from {url}")
    return results
68