Mustehson committed on
Commit
a831d50
1 Parent(s): fda45ca

Scrape&Clean Data

Files changed (3)
  1. app.py +124 -59
  2. logo.png +0 -0
  3. requirements.txt +8 -1
app.py CHANGED
@@ -1,63 +1,128 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)


 if __name__ == "__main__":
-    demo.launch()
+import re
 import gradio as gr
+from io import StringIO
+import pandas as pd
+from langchain_community.document_loaders import RecursiveUrlLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.document_transformers import Html2TextTransformer


+TAB_LINES = 22
+
+
+def scrape_text(url):
+    try:
+        loader = RecursiveUrlLoader(url=url, max_depth=None,
+                                    prevent_outside=True, check_response_status=True)
+        documents = loader.load()
+    except Exception as e:
+        print(f"Error loading URL: {e}")
+        return None
+    return documents
+
+
+def clean_text(documents):
+    html2text = Html2TextTransformer()
+    docs_transformed = html2text.transform_documents([documents])
+    cleaned_string = re.sub(r'\n\n+|\n+|\s+', ' ', docs_transformed[0].page_content)
+    docs_transformed[0].page_content = cleaned_string
+    return docs_transformed
+
+
+def remove_tables(docs):
+    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+    docs.page_content = table_pattern.sub('', docs.page_content)
+    return docs
+
+
+def format_chunks_with_spaces(chunks):
+    separator = "\n\n---\n\n"
+    formatted_chunks = ""
+    for i, chunk in enumerate(chunks):
+        formatted_chunks += f"Chunk {i+1}: \n\n"
+        formatted_chunks += chunk.page_content
+        formatted_chunks += separator
+    return formatted_chunks
+
+
+def get_tables(raw_html):
+    try:
+        tables = pd.read_html(StringIO(str(raw_html.page_content)))
+    except Exception as e:
+        print(f"Error reading table: {e}")
+        return None
+    return tables
+
+
+def concat_dfs(df_list):
+    concatenated_df = pd.concat(df_list, ignore_index=True)
+    return concatenated_df
+
+
+def get_docs(url):
+    raw_html = scrape_text(url)
+    if raw_html is None:
+        return None, None, None, None, None
+
+    tables_list = get_tables(raw_html[0])
+
+    if tables_list is not None:
+        concat_tables = concat_dfs(tables_list)
+    else:
+        concat_tables = None
+
+    tables_rmv_html = remove_tables(raw_html[0])
+    clean_docs = clean_text(tables_rmv_html)
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
+    documents_splits = text_splitter.split_documents(clean_docs)
+    formatted_chunks = format_chunks_with_spaces(documents_splits)
+
+    return raw_html[0].page_content, clean_docs[0].page_content, concat_tables, raw_html[0].metadata, formatted_chunks
+
+
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
+
+    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
+
+    gr.Markdown("""
+    <div style='text-align: center;'>
+        <strong style='font-size: 36px;'>Domain Document Indexing</strong>
+
+    </div>
+    """)
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
+            scrape_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
+
+        with gr.Column(elem_id="col_container", scale=2):
+            with gr.Tabs():
+                with gr.Tab("RAW HTML"):
+                    raw_page_content = gr.Textbox(lines=TAB_LINES, label="Page Content HTML", value="", interactive=False,
+                                                  autoscroll=False)
+                with gr.Tab("Clean Content"):
+                    page_content = gr.Textbox(lines=TAB_LINES, label="Clean Page Content", value="", interactive=False,
+                                              autoscroll=False)
+                with gr.Tab("Tables"):
+                    tables = gr.Textbox(lines=TAB_LINES, label="Tables", value="", interactive=False,
+                                        autoscroll=False)
+                with gr.Tab("Chunks"):
+                    parsed_chunks = gr.Textbox(lines=TAB_LINES, label="Parsed Chunks", value="", interactive=False,
+                                               autoscroll=False)
+                with gr.Tab("Metadata"):
+                    metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
+                                          autoscroll=False)
+
+    scrape_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables,
+                                                                 metadata, parsed_chunks])
+
+
 if __name__ == "__main__":
+    demo.launch()
+
logo.png ADDED
requirements.txt CHANGED
@@ -1 +1,8 @@
-huggingface_hub==0.22.2
+gradio
+pandas
+langchain
+langchain-community
+langchain-text-splitters
+html2text
+lxml
+beautifulsoup4
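The dependency list is unpinned, so a quick local check that the new packages resolve and import cleanly can save a failed Space build. A small sketch, assuming the packages were installed with pip install -r requirements.txt:

# Import check for the new requirements (a sketch; run in the target environment).
import gradio, pandas, langchain, langchain_community, langchain_text_splitters
import html2text, lxml, bs4  # lxml/bs4 back pandas.read_html and the loader's HTML parsing

print("All requirements import cleanly:", gradio.__version__, pandas.__version__)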