limitedonly41 committed
Commit 2891c11 · verified · 1 Parent(s): c885544

Update app.py

Files changed (1)
  1. app.py +266 -107
app.py CHANGED
@@ -1,22 +1,28 @@
  import gradio as gr
- import asyncio
- import requests
- from bs4 import BeautifulSoup
  import pandas as pd
  from tqdm import tqdm
  import urllib
- from deep_translator import GoogleTranslator
- import spaces
-
- # from unsloth import FastLanguageModel
- import torch
- import re
-
-
- # Define helper functions
- async def fetch_data(url):
      headers = {
          'Accept': '*/*',
          'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
@@ -32,38 +38,33 @@ async def fetch_data(url):
      }

      encoding = 'utf-8'
-     timeout = 10
-
      try:
-         def get_content():
-             req = urllib.request.Request(url, headers=headers)
-             with urllib.request.urlopen(req, timeout=timeout) as response:
-                 return response.read()
-
-         response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)

          soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

          title = soup.find('title').text
          description = soup.find('meta', attrs={'name': 'description'})
-         if description and "content" in description.attrs:
-             description = description.get("content")
-         else:
-             description = ""

          keywords = soup.find('meta', attrs={'name': 'keywords'})
-         if keywords and "content" in keywords.attrs:
-             keywords = keywords.get("content")
-         else:
-             keywords = ""

-         h1_all = " ".join(h.text for h in soup.find_all('h1'))
-         h2_all = " ".join(h.text for h in soup.find_all('h2'))
-         h3_all = " ".join(h.text for h in soup.find_all('h3'))
-         paragraphs_all = " ".join(p.text for p in soup.find_all('p'))

-         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
-         allthecontent = allthecontent[:4999]

          return {
              'url': url,
@@ -77,6 +78,7 @@ async def fetch_data(url):
              'text': allthecontent
          }
      except Exception as e:
          return {
              'url': url,
              'title': None,
@@ -89,45 +91,25 @@ async def fetch_data(url):
              'text': None
          }

- def concatenate_text(data):
-     text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
-     text = ' '.join(text_parts)
-     text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
-     text = re.sub(r'\s{2,}', ' ', text)
-     return text
-
- def translate_text(text):
-     try:
-         text = text[:4990]
-         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
-         return translated_text
-     except Exception as e:
-         print(f"An error occurred during translation: {e}")
-         return None


- model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
-
- # Initialize model and tokenizer variables
- model = None
- tokenizer = None
-
  @spaces.GPU()
- def summarize_url(url):

      global model, tokenizer  # Declare model and tokenizer as global variables

-     # Load the model
-     max_seq_length = 2048
-     dtype = None
-     load_in_4bit = True
-
      if model is None or tokenizer is None:
-         from unsloth import FastLanguageModel
-
-         # Load the model and tokenizer
          model, tokenizer = FastLanguageModel.from_pretrained(
-             model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
              max_seq_length=max_seq_length,
              dtype=dtype,
              load_in_4bit=load_in_4bit,
@@ -135,11 +117,14 @@ def summarize_url(url):
          FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


-     result = asyncio.run(fetch_data(url))
-     text = concatenate_text(result)
-     translated_text = translate_text(text)
-     if len(translated_text) < 100:
-         return 'not scraped or short text'
      alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

  ### Instruction:
@@ -151,7 +136,7 @@ def summarize_url(url):
  ### Response:
  """

-     prompt = alpaca_prompt.format(translated_text)
      inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

      outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
@@ -159,55 +144,229 @@ def summarize_url(url):
      final_answer = summary.split("### Response:")[1].strip()
      return final_answer
-
-
- # # Create the Gradio interface within a `Blocks` context, like the working example
- # with gr.Blocks() as demo:
-
- #     # Add title and description to the interface
- #     gr.HTML("<h1>Website Summary Generator</h1>")
- #     gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
-
- #     # Define input and output elements
- #     with gr.Row():
- #         prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
- #         output_text = gr.Textbox(label="Topic", interactive=False)
-
- #     # Add the button to trigger the function
- #     submit = gr.Button("Classify")
-
- #     # Define the interaction between inputs and outputs
- #     submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)
-
- # # Add the `if __name__ == "__main__":` block to launch the interface
- # if __name__ == "__main__":
- #     demo.launch()
-
-
- # with gr as demo:
- #     # Define Gradio interface
- #     demo = demo.Interface(
- #         fn=summarize_url,
- #         inputs="text",
- #         outputs="text",
- #         title="Website Summary Generator",
- #         description="Enter a URL to get a one-word topic summary of the website content."
- #     )
-
-
- # if __name__ == "__main__":
- #     demo.launch()
-
-
- # Create a Gradio interface
- iface = gr.Interface(
-     fn=summarize_url,
-     inputs="text",
-     outputs="text",
-     title="Website Summary Generator",
-     description="Enter a URL to get a one-word topic summary of the website content."
- )
-
- # Launch the interface
- iface.launch()
  import gradio as gr
+ import torch
+ import spaces
+ import logging
+ from deep_translator import GoogleTranslator
  import pandas as pd
  from tqdm import tqdm
  import urllib
+ from bs4 import BeautifulSoup
+
+ # Configure logging to write messages to a file
+ logging.basicConfig(filename='app.log', level=logging.ERROR)
+
+ # Configuration
+ max_seq_length = 2048
+ dtype = None  # Auto detection of dtype
+ load_in_4bit = True  # Use 4-bit quantization to reduce memory usage
+
+ # peft_model_name = "limitedonly41/website_qwen2_7b_2"
+ peft_model_name = "limitedonly41/website_mistral7b_v02"
+ # Initialize model and tokenizer variables
+ model = None
+ tokenizer = None

+ def fetch_data(url):
      headers = {
          'Accept': '*/*',
          'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',

      }

      encoding = 'utf-8'
+     timeout = 10  # Set your desired timeout value in seconds
      try:
+         # Make the request using urllib
+         req = urllib.request.Request(url, headers=headers)
+         with urllib.request.urlopen(req, timeout=timeout) as response:
+             response_content = response.read()

          soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

          title = soup.find('title').text
          description = soup.find('meta', attrs={'name': 'description'})
+         description = description.get("content") if description and "content" in description.attrs else ""

          keywords = soup.find('meta', attrs={'name': 'keywords'})
+         keywords = keywords.get("content") if keywords and "content" in keywords.attrs else ""

+         h1_all = ". ".join(h.text for h in soup.find_all('h1'))
+         paragraphs_all = ". ".join(p.text for p in soup.find_all('p'))
+         h2_all = ". ".join(h.text for h in soup.find_all('h2'))
+         h3_all = ". ".join(h.text for h in soup.find_all('h3'))
+
+         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]
+
+         # Clean up the text
+         h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+         h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+         h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

          return {
              'url': url,

              'text': allthecontent
          }
      except Exception as e:
+         print(url, e)
          return {
              'url': url,
              'title': None,

              'text': None
          }

+ def main(urls):
+     results = []
+     for url in tqdm(urls):
+         result = fetch_data(url)
+         results.append(result)
+     return results
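
For orientation, a minimal sketch of exercising the scraping helpers above on their own (an editor's illustration, not part of the committed file; the URLs are hypothetical):

    # Hypothetical usage of fetch_data/main as defined in app.py above.
    pages = main(["https://example.com", "https://www.wikipedia.org"])
    df = pd.DataFrame(pages)  # same DataFrame shape classify_website builds below
    print(df[["url", "title"]])
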
  @spaces.GPU()
+ def classify_website(url):
+     from unsloth import FastLanguageModel  # Import moved to the top for model loading

      global model, tokenizer  # Declare model and tokenizer as global variables

      if model is None or tokenizer is None:
+
+         # Load the model and tokenizer during initialization (in the main process)
          model, tokenizer = FastLanguageModel.from_pretrained(
+             model_name=peft_model_name,
              max_seq_length=max_seq_length,
              dtype=dtype,
              load_in_4bit=load_in_4bit,

          FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


+     urls = [url]
+     results_shop = main(urls)
+
+     # Convert results to DataFrame
+     df_result_train_more = pd.DataFrame(results_shop)
+     text = df_result_train_more['text'][0]
+     translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
+
      alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

  ### Instruction:

  ### Response:
  """

+     prompt = alpaca_prompt.format(translated)
      inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

      outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)

      final_answer = summary.split("### Response:")[1].strip()
      return final_answer
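
classify_website recovers the topic by splitting the decoded generation on the "### Response:" marker. A self-contained illustration of that step (the decoded string is invented for the example):

    # `decoded` stands in for tokenizer.decode(outputs[0], skip_special_tokens=True).
    decoded = (
        "Below is an instruction that describes a task...\n\n"
        "### Instruction:\nDescribe the website text into one word topic:\n\n"
        "### Input:\nExample Shop. Buy running shoes online.\n\n"
        "### Response:\nEcommerce"
    )
    final_answer = decoded.split("### Response:")[1].strip()
    print(final_answer)  # -> Ecommerce
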

+ # Create a Gradio interface
+ iface = gr.Interface(
+     fn=classify_website,
+     inputs="text",
+     outputs="text",
+     title="Website Topic",
+     description="Enter a URL to get a topic summary of the website content."
+ )
+
+ # Launch the interface
+ iface.launch()
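
The model is loaded lazily into module-level globals, so only the first GPU call pays the load cost and later calls reuse the loaded weights. The same pattern in a stripped-down, runnable sketch (load_model here is a dummy stand-in for FastLanguageModel.from_pretrained):

    model = None
    tokenizer = None

    def load_model():
        # Dummy loader standing in for FastLanguageModel.from_pretrained(...).
        return "model-object", "tokenizer-object"

    def get_model():
        global model, tokenizer
        if model is None or tokenizer is None:  # load once, reuse on later calls
            model, tokenizer = load_model()
        return model, tokenizer
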
+
+ # import gradio as gr
+ # import asyncio
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import pandas as pd
+ # from tqdm import tqdm
+ # import urllib
+ # from deep_translator import GoogleTranslator
+ # import spaces
+
+
+ # # from unsloth import FastLanguageModel
+ # import torch
+ # import re
+
+
+ # # Define helper functions
+ # async def fetch_data(url):
+ #     headers = {
+ #         'Accept': '*/*',
+ #         'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
+ #         'Connection': 'keep-alive',
+ #         'Referer': f'{url}',
+ #         'Sec-Fetch-Dest': 'empty',
+ #         'Sec-Fetch-Mode': 'cors',
+ #         'Sec-Fetch-Site': 'cross-site',
+ #         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
+ #         'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
+ #         'sec-ch-ua-mobile': '?0',
+ #         'sec-ch-ua-platform': '"macOS"',
+ #     }
+
+ #     encoding = 'utf-8'
+ #     timeout = 10
+
+ #     try:
+ #         def get_content():
+ #             req = urllib.request.Request(url, headers=headers)
+ #             with urllib.request.urlopen(req, timeout=timeout) as response:
+ #                 return response.read()
+
+ #         response_content = await asyncio.get_event_loop().run_in_executor(None, get_content)
+
+ #         soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
+
+ #         title = soup.find('title').text
+ #         description = soup.find('meta', attrs={'name': 'description'})
+ #         if description and "content" in description.attrs:
+ #             description = description.get("content")
+ #         else:
+ #             description = ""
+
+ #         keywords = soup.find('meta', attrs={'name': 'keywords'})
+ #         if keywords and "content" in keywords.attrs:
+ #             keywords = keywords.get("content")
+ #         else:
+ #             keywords = ""
+
+ #         h1_all = " ".join(h.text for h in soup.find_all('h1'))
+ #         h2_all = " ".join(h.text for h in soup.find_all('h2'))
+ #         h3_all = " ".join(h.text for h in soup.find_all('h3'))
+ #         paragraphs_all = " ".join(p.text for p in soup.find_all('p'))
+
+ #         allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
+ #         allthecontent = allthecontent[:4999]
+
+ #         return {
+ #             'url': url,
+ #             'title': title,
+ #             'description': description,
+ #             'keywords': keywords,
+ #             'h1': h1_all,
+ #             'h2': h2_all,
+ #             'h3': h3_all,
+ #             'paragraphs': paragraphs_all,
+ #             'text': allthecontent
+ #         }
+ #     except Exception as e:
+ #         return {
+ #             'url': url,
+ #             'title': None,
+ #             'description': None,
+ #             'keywords': None,
+ #             'h1': None,
+ #             'h2': None,
+ #             'h3': None,
+ #             'paragraphs': None,
+ #             'text': None
+ #         }
+
+ # def concatenate_text(data):
+ #     text_parts = [str(data[col]) for col in ['url', 'title', 'description', 'keywords', 'h1', 'h2', 'h3'] if data[col]]
+ #     text = ' '.join(text_parts)
+ #     text = text.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
+ #     text = re.sub(r'\s{2,}', ' ', text)
+ #     return text
+
+ # def translate_text(text):
+ #     try:
+ #         text = text[:4990]
+ #         translated_text = GoogleTranslator(source='auto', target='en').translate(text)
+ #         return translated_text
+ #     except Exception as e:
+ #         print(f"An error occurred during translation: {e}")
+ #         return None
+
+
+ # model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
+
+ # # Initialize model and tokenizer variables
+ # model = None
+ # tokenizer = None
+
+ # @spaces.GPU()
+ # def summarize_url(url):
+
+ #     global model, tokenizer  # Declare model and tokenizer as global variables
+
+ #     # Load the model
+ #     max_seq_length = 2048
+ #     dtype = None
+ #     load_in_4bit = True
+
+ #     if model is None or tokenizer is None:
+ #         from unsloth import FastLanguageModel
+
+ #         # Load the model and tokenizer
+ #         model, tokenizer = FastLanguageModel.from_pretrained(
+ #             model_name=model_name,  # YOUR MODEL YOU USED FOR TRAINING
+ #             max_seq_length=max_seq_length,
+ #             dtype=dtype,
+ #             load_in_4bit=load_in_4bit,
+ #         )
+ #         FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
+
+
+ #     result = asyncio.run(fetch_data(url))
+ #     text = concatenate_text(result)
+ #     translated_text = translate_text(text)
+ #     if len(translated_text) < 100:
+ #         return 'not scraped or short text'
+ #     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+ # ### Instruction:
+ # Describe the website text into one word topic:
+
+ # ### Input:
+ # {}
+
+ # ### Response:
+ # """
+
+ #     prompt = alpaca_prompt.format(translated_text)
+ #     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+ #     outputs = model.generate(inputs.input_ids, max_new_tokens=64, use_cache=True)
+ #     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #     final_answer = summary.split("### Response:")[1].strip()
+ #     return final_answer
+
+
+ # # # Create the Gradio interface within a `Blocks` context, like the working example
+ # # with gr.Blocks() as demo:
+
+ # #     # Add title and description to the interface
+ # #     gr.HTML("<h1>Website Summary Generator</h1>")
+ # #     gr.HTML("<p>Enter a URL to get a one-word topic summary of the website content..</p>")
+
+ # #     # Define input and output elements
+ # #     with gr.Row():
+ # #         prompt = gr.Textbox(label="Enter Website URL", placeholder="https://example.com")
+ # #         output_text = gr.Textbox(label="Topic", interactive=False)
+
+ # #     # Add the button to trigger the function
+ # #     submit = gr.Button("Classify")
+
+ # #     # Define the interaction between inputs and outputs
+ # #     submit.click(fn=summarize_url, inputs=prompt, outputs=output_text)
+
+ # # # Add the `if __name__ == "__main__":` block to launch the interface
+ # # if __name__ == "__main__":
+ # #     demo.launch()
+
+
+ # # with gr as demo:
+ # #     # Define Gradio interface
+ # #     demo = demo.Interface(
+ # #         fn=summarize_url,
+ # #         inputs="text",
+ # #         outputs="text",
+ # #         title="Website Summary Generator",
+ # #         description="Enter a URL to get a one-word topic summary of the website content."
+ # #     )
+
+
+ # # if __name__ == "__main__":
+ # #     demo.launch()
+
+
+ # # Create a Gradio interface
+ # iface = gr.Interface(
+ #     fn=summarize_url,
+ #     inputs="text",
+ #     outputs="text",
+ #     title="Website Summary Generator",
+ #     description="Enter a URL to get a one-word topic summary of the website content."
+ # )
+
+ # # Launch the interface
+ # iface.launch()
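
Finally, the translation step that feeds the prompt can be checked in isolation. A minimal sketch using the same deep_translator call as classify_website (the sample text is made up):

    from deep_translator import GoogleTranslator

    # Auto-detect the source language and translate to English, truncating to
    # 4990 characters as app.py does, presumably to stay under the ~5000-character
    # limit of the translation service.
    sample = "Интернет-магазин обуви. Купить кроссовки онлайн."
    print(GoogleTranslator(source='auto', target='en').translate(sample[:4990]))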