Spaces:
Sleeping
Sleeping
Mustehson
committed on
Commit
·
bb41ea7
1
Parent(s):
2833068
Added Langsmith
Browse files
- app.py +4 -2
- requirements.txt +6 -5
app.py
CHANGED
@@ -8,6 +8,8 @@ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
|
8 |
from langchain_community.document_loaders import RecursiveUrlLoader
|
9 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
10 |
from langchain_community.document_transformers import Html2TextTransformer
|
|
|
|
|
11 |
|
12 |
|
13 |
TAB_LINES = 22
|
@@ -52,7 +54,7 @@ def scrape_text(url, max_depth):
|
|
52 |
return None
|
53 |
return documents
|
54 |
|
55 |
-
|
56 |
def clean_text(docs):
|
57 |
html2text = Html2TextTransformer()
|
58 |
docs_transformed = html2text.transform_documents(docs)
|
@@ -93,7 +95,7 @@ def format_page_content(docs):
|
|
93 |
formatted_docs += "\n\n---\n\n"
|
94 |
return formatted_docs
|
95 |
|
96 |
-
|
97 |
def get_tables(raw_docs):
|
98 |
tables_list = []
|
99 |
for raw_doc in raw_docs:
|
|
|
8 |
from langchain_community.document_loaders import RecursiveUrlLoader
|
9 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
10 |
from langchain_community.document_transformers import Html2TextTransformer
|
11 |
+
from langsmith import traceable
|
12 |
+
|
13 |
|
14 |
|
15 |
TAB_LINES = 22
|
|
|
54 |
return None
|
55 |
return documents
|
56 |
|
57 |
+
@traceable()
|
58 |
def clean_text(docs):
|
59 |
html2text = Html2TextTransformer()
|
60 |
docs_transformed = html2text.transform_documents(docs)
|
|
|
95 |
formatted_docs += "\n\n---\n\n"
|
96 |
return formatted_docs
|
97 |
|
98 |
+
@traceable()
|
99 |
def get_tables(raw_docs):
|
100 |
tables_list = []
|
101 |
for raw_doc in raw_docs:
|
requirements.txt
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
-
pandas
|
2 |
-
langchain
|
3 |
-
langchain-community
|
4 |
langchain-text-splitters
|
5 |
html2text
|
6 |
lxml
|
7 |
beautifulsoup4
|
8 |
html5lib
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
langchain-text-splitters
|
2 |
html2text
|
3 |
lxml
|
4 |
beautifulsoup4
|
5 |
html5lib
|
6 |
+
pandas==2.2.2
|
7 |
+
langchain==0.3.3
|
8 |
+
langchain-community==0.3.2
|
9 |
+
langsmith==0.1.135
|
10 |
+
duckdb==1.1.1
|
11 |
+
sentence_transformers==3.2.0
|