Spaces:

Prathmesh48
/

Process-Links

Sleeping

Prathmesh48 committed on May 31, 2024

Commit

37ea6f0

verified ·

1 Parent(s): 73f4358

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import json
 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 from langchain_community.document_loaders import PyPDFLoader
 from langdetect import detect_langs
 from PyPDF2 import PdfReader
 from io import BytesIO
@@ -65,7 +66,7 @@ def download_pdf(url, timeout=10):
         logging.error(f"PDF download error: {e}")
         return None
-def extract_text_from_pages(pdf_file, pages):
     reader = PdfReader(pdf_file)
     extracted_text = ""
     try:
@@ -80,19 +81,35 @@ def extract_text_from_pages(pdf_file, pages):
         logging.error(f"PDF text extraction error: {e}")
         return 'हे चालत नाही'
 def process_link(link, similar_product):
     if link in seen:
         return None
     seen.add(link)
     try:
-        pdf_file = download_pdf(link)
-        if pdf_file:
-            text = extract_text_from_pages(pdf_file, [0, 2, 4])
-            if language_preprocess(text):
-                if relevant(main_product, similar_product, text):
-                    return link
-    except Exception as e:
-        logging.error(f"Error processing link: {e}")
     return None
 def filtering(urls, similar_product):

 import concurrent.futures
 from concurrent.futures import ThreadPoolExecutor
 from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import WebBaseLoader
 from langdetect import detect_langs
 from PyPDF2 import PdfReader
 from io import BytesIO
         logging.error(f"PDF download error: {e}")
         return None
+def extract_text_from_pdf(pdf_file, pages):
     reader = PdfReader(pdf_file)
     extracted_text = ""
     try:
         logging.error(f"PDF text extraction error: {e}")
         return 'हे चालत नाही'
+def extract_text_online(link):
+    loader = WebBaseLoader(link)
+    pages = loader.load_and_split()
+    text = ''
+    for page in pages[:3]:
+        text+=page.page_content
+    return text
 def process_link(link, similar_product):
     if link in seen:
         return None
     seen.add(link)
     try:
+        if link[-3:]=='.md':
+            text = extract_text_online(link)
+        else:
+            pdf_file = download_pdf(link)
+            text = extract_text_from_pdf(pdf_file, [0, 2, 4])
+        if language_preprocess(text):
+            if relevant(main_product, similar_product, text):
+                return link
+    except:
+        pass
     return None
 def filtering(urls, similar_product):