ghosthets committed on
Commit
7d512b6
·
verified ·
1 Parent(s): 067cac7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -10
app.py CHANGED
@@ -3,16 +3,28 @@ import requests
3
  from bs4 import BeautifulSoup
4
  from transformers import pipeline
5
 
6
- # Use open-access model
7
- refiner = pipeline("text2text-generation", model="google/flan-t5-large")
8
 
9
  def refine_from_url(url, instruction):
10
  try:
11
- response = requests.get(url, timeout=5)
12
  soup = BeautifulSoup(response.text, "html.parser")
13
  raw_text = soup.get_text(separator="\n")
14
- prompt = f"{instruction}\n\n{raw_text[:3000]}" # truncate for token limit
15
- output = refiner(prompt, max_new_tokens=512)[0]["generated_text"]
 
 
 
 
 
 
 
 
 
 
 
 
16
  return output
17
  except Exception as e:
18
  return f"Error: {str(e)}"
@@ -20,12 +32,12 @@ def refine_from_url(url, instruction):
20
  demo = gr.Interface(
21
  fn=refine_from_url,
22
  inputs=[
23
- gr.Textbox(label="Enter URL"),
24
- gr.Textbox(label="Refinement Instruction", placeholder="e.g. Clean and structure this for AI training")
25
  ],
26
- outputs=gr.Textbox(label="Refined Output"),
27
- title="🧠 Data Refiner with Flan-T5",
28
- description="Crawl any webpage and refine its content using Flan-T5 for AI training or research."
29
  )
30
 
31
  if __name__ == "__main__":
 
3
  from bs4 import BeautifulSoup
4
  from transformers import pipeline
5
 
6
+ # Load open-access LLM
7
+ llm = pipeline("text2text-generation", model="deepseek-ai/deepseek-coder-6.7b-instruct")
8
 
9
  def refine_from_url(url, instruction):
10
  try:
11
+ response = requests.get(url, timeout=10)
12
  soup = BeautifulSoup(response.text, "html.parser")
13
  raw_text = soup.get_text(separator="\n")
14
+
15
+ prompt = f"""
16
+ You are a data refinement agent. Given the following webpage content, do the following:
17
+ 1. Extract clear headings and structure the content.
18
+ 2. Generate 5 question-answer pairs based on the content.
19
+ 3. Format everything in JSONL style for GPT2 training.
20
+
21
+ Instruction: {instruction}
22
+
23
+ Content:
24
+ {raw_text[:6000]}
25
+ """
26
+
27
+ output = llm(prompt, max_new_tokens=1024)[0]["generated_text"]
28
  return output
29
  except Exception as e:
30
  return f"Error: {str(e)}"
 
32
  demo = gr.Interface(
33
  fn=refine_from_url,
34
  inputs=[
35
+ gr.Textbox(label="🔗 Enter Webpage URL"),
36
+ gr.Textbox(label="🧠 Instruction", placeholder="e.g. Clean and format this for GPT2 training")
37
  ],
38
+ outputs=gr.Textbox(label="📄 Refined JSONL Output"),
39
+ title="🧠 Link-Based Data Refiner + Q&A Generator",
40
+ description="Paste any webpage link. This app will crawl, refine, and generate question-answer pairs using DeepSeek LLM."
41
  )
42
 
43
  if __name__ == "__main__":