Enhance document loading with alternative method using BeautifulSoup and update requirements for requests and beautifulsoup4
Browse files- api/fastapi_server.py +25 -3
- requirements.txt +2 -1
api/fastapi_server.py
CHANGED
|
@@ -30,6 +30,8 @@ import traceback
|
|
| 30 |
from typing import Dict, List, Optional
|
| 31 |
from pydantic import BaseModel
|
| 32 |
from huggingface_hub import Repository, snapshot_download
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Initialize environment variables
|
| 35 |
load_dotenv()
|
|
@@ -232,15 +234,35 @@ def build_knowledge_base():
|
|
| 232 |
# Create folder in advance
|
| 233 |
os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# Load documents with detailed logging
|
| 236 |
for url in URLS:
|
| 237 |
try:
|
| 238 |
print(f"Attempting to load {url}")
|
| 239 |
-
loader = WebBaseLoader(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
docs = loader.load()
|
| 241 |
print(f"Successfully loaded {url}, got {len(docs)} documents")
|
| 242 |
-
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
except Exception as e:
|
| 245 |
print(f"Failed to load {url}: {str(e)}")
|
| 246 |
print(f"Full error: {traceback.format_exc()}")
|
|
|
|
| 30 |
from typing import Dict, List, Optional
|
| 31 |
from pydantic import BaseModel
|
| 32 |
from huggingface_hub import Repository, snapshot_download
|
| 33 |
+
import requests
|
| 34 |
+
from bs4 import BeautifulSoup
|
| 35 |
|
| 36 |
# Initialize environment variables
|
| 37 |
load_dotenv()
|
|
|
|
| 234 |
# Create folder in advance
|
| 235 |
os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
|
| 236 |
|
| 237 |
+
headers = {
|
| 238 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
# Load documents with detailed logging
|
| 242 |
for url in URLS:
|
| 243 |
try:
|
| 244 |
print(f"Attempting to load {url}")
|
| 245 |
+
loader = WebBaseLoader(
|
| 246 |
+
web_paths=[url],
|
| 247 |
+
header_template=headers,
|
| 248 |
+
requests_per_second=2,
|
| 249 |
+
timeout=30
|
| 250 |
+
)
|
| 251 |
docs = loader.load()
|
| 252 |
print(f"Successfully loaded {url}, got {len(docs)} documents")
|
| 253 |
+
if docs:
|
| 254 |
+
documents.extend(docs)
|
| 255 |
+
else:
|
| 256 |
+
# Try an alternative loading method
|
| 257 |
+
response = requests.get(url, headers=headers, timeout=30)
|
| 258 |
+
response.raise_for_status()
|
| 259 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 260 |
+
# Extract the main content, excluding navigation and footer
|
| 261 |
+
main_content = ' '.join([p.text for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])])
|
| 262 |
+
if main_content:
|
| 263 |
+
from langchain_core.documents import Document
|
| 264 |
+
documents.append(Document(page_content=main_content, metadata={"source": url}))
|
| 265 |
+
print(f"Loaded {url} using alternative method")
|
| 266 |
except Exception as e:
|
| 267 |
print(f"Failed to load {url}: {str(e)}")
|
| 268 |
print(f"Full error: {traceback.format_exc()}")
|
requirements.txt
CHANGED
|
@@ -13,4 +13,5 @@ huggingface_hub>=0.19.0
|
|
| 13 |
jinja2>=3.0.0
|
| 14 |
aiofiles>=0.8.0
|
| 15 |
python-multipart>=0.0.6
|
| 16 |
-
|
|
|
|
|
|
| 13 |
jinja2>=3.0.0
|
| 14 |
aiofiles>=0.8.0
|
| 15 |
python-multipart>=0.0.6
|
| 16 |
+
beautifulsoup4>=4.12.0
|
| 17 |
+
requests>=2.31.0
|