Prathmesh48 committed on
Commit
37ea6f0
·
verified ·
1 Parent(s): 73f4358

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -9
app.py CHANGED
@@ -6,6 +6,7 @@ import json
6
  import concurrent.futures
7
  from concurrent.futures import ThreadPoolExecutor
8
  from langchain_community.document_loaders import PyPDFLoader
 
9
  from langdetect import detect_langs
10
  from PyPDF2 import PdfReader
11
  from io import BytesIO
@@ -65,7 +66,7 @@ def download_pdf(url, timeout=10):
65
  logging.error(f"PDF download error: {e}")
66
  return None
67
 
68
- def extract_text_from_pages(pdf_file, pages):
69
  reader = PdfReader(pdf_file)
70
  extracted_text = ""
71
  try:
@@ -80,19 +81,35 @@ def extract_text_from_pages(pdf_file, pages):
80
  logging.error(f"PDF text extraction error: {e}")
81
  return 'हे चालत नाही'
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  def process_link(link, similar_product):
84
  if link in seen:
85
  return None
86
  seen.add(link)
87
  try:
88
- pdf_file = download_pdf(link)
89
- if pdf_file:
90
- text = extract_text_from_pages(pdf_file, [0, 2, 4])
91
- if language_preprocess(text):
92
- if relevant(main_product, similar_product, text):
93
- return link
94
- except Exception as e:
95
- logging.error(f"Error processing link: {e}")
 
 
 
96
  return None
97
 
98
  def filtering(urls, similar_product):
 
6
  import concurrent.futures
7
  from concurrent.futures import ThreadPoolExecutor
8
  from langchain_community.document_loaders import PyPDFLoader
9
+ from langchain_community.document_loaders import WebBaseLoader
10
  from langdetect import detect_langs
11
  from PyPDF2 import PdfReader
12
  from io import BytesIO
 
66
  logging.error(f"PDF download error: {e}")
67
  return None
68
 
69
+ def extract_text_from_pdf(pdf_file, pages):
70
  reader = PdfReader(pdf_file)
71
  extracted_text = ""
72
  try:
 
81
  logging.error(f"PDF text extraction error: {e}")
82
  return 'हे चालत नाही'
83
 
84
def extract_text_online(link):
    """Return the text of the first chunks of the web document at ``link``.

    The document is fetched with ``WebBaseLoader`` and split via
    ``load_and_split()``. The ``page_content`` of at most the first three
    resulting documents is concatenated with no separator.

    Args:
        link: URL handed to ``WebBaseLoader``.

    Returns:
        The concatenated text; an empty string when the loader yields no
        documents.
    """
    documents = WebBaseLoader(link).load_and_split()
    # Only the leading three chunks are used.
    return ''.join(doc.page_content for doc in documents[:3])
95
+
96
+
97
  def process_link(link, similar_product):
98
  if link in seen:
99
  return None
100
  seen.add(link)
101
  try:
102
+ if link[-3:]=='.md':
103
+ text = extract_text_online(link)
104
+ else:
105
+ pdf_file = download_pdf(link)
106
+ text = extract_text_from_pdf(pdf_file, [0, 2, 4])
107
+
108
+ if language_preprocess(text):
109
+ if relevant(main_product, similar_product, text):
110
+ return link
111
+ except:
112
+ pass
113
  return None
114
 
115
  def filtering(urls, similar_product):