GitHub Actions Bot committed on
Commit c48903e · 0 Parent(s)

Changes from ggruber193/polars-docu-chat-rag

app.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ beautifulsoup4~=4.13.4
+ markdown~=3.8
+ langchain~=0.3.23
+ transformers~=4.51.3
+ torch~=2.6.0
+ tqdm~=4.67.1
+ qdrant_client
src/config.py ADDED
@@ -0,0 +1,12 @@
+ EMBEDDING_MODEL = "thenlper/gte-small"
+
+ QDRANT_COLLECTION_NAME = "polars-documentation"
+
+ def get_qdrant_config():
+     from qdrant_client import models
+     QDRANT_COLLECTION_CONFIG = {
+         "collection_name": QDRANT_COLLECTION_NAME,
+         "vectors_config": models.VectorParams(size=384, distance=models.Distance.COSINE),  # on_disk=True
+         # "hnsw_config": models.HnswConfigDiff(on_disk=True)
+     }
+     return QDRANT_COLLECTION_CONFIG
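
A minimal sketch (not part of the commit) of how this config is typically consumed: get_qdrant_config() returns keyword arguments for QdrantClient.create_collection, and size=384 matches the output dimension of the gte-small embedding model. The localhost URL is an assumption.

from qdrant_client import QdrantClient
from src.config import get_qdrant_config

client = QdrantClient(url="http://localhost:6333")  # assumed local Qdrant instance
config = get_qdrant_config()
if not client.collection_exists(config["collection_name"]):
    client.create_collection(**config)  # 384-dim vectors with cosine distance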
src/data_processing/embeddings.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ from torch import Tensor
+ from torch.nn import functional as F
+
+ from src.config import EMBEDDING_MODEL
+ from src.utils import batched
+
+
+ class TextEmbedder:
+     def __init__(self, modelname=EMBEDDING_MODEL, max_length=512):
+         self.tokenizer = AutoTokenizer.from_pretrained(modelname)
+         self.model = AutoModel.from_pretrained(modelname)
+         self.max_length = max_length
+
+     @staticmethod
+     def average_pool(last_hidden_states: Tensor,
+                      attention_mask: Tensor) -> Tensor:
+         # Mean-pool token embeddings, ignoring padding positions.
+         last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+         return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+     def embed_text(self, text: str | list[str], batch_size=128):
+         if isinstance(text, str):
+             text = [text]
+
+         outputs = []
+
+         for batch in batched(text, n=batch_size):
+             batch_dict = self.tokenizer(list(batch), max_length=self.max_length, padding=True, truncation=True, return_tensors='pt')
+             with torch.no_grad():  # inference only, no gradients needed
+                 output = self.model(**batch_dict)
+             embeddings = self.average_pool(output.last_hidden_state, batch_dict['attention_mask'])
+
+             # embeddings = F.normalize(embeddings, p=2, dim=1)
+             # scores = (embeddings[:1] @ embeddings[1:].T) * 100
+
+             embeddings = embeddings.tolist()
+             outputs += embeddings
+         return outputs
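
For context, a short usage sketch (the sample strings are made up): the first call downloads the thenlper/gte-small weights from the Hugging Face Hub, and embed_text returns one 384-dimensional vector per input string.

from src.data_processing.embeddings import TextEmbedder

embedder = TextEmbedder()
vectors = embedder.embed_text([
    "How do I read a CSV file with Polars?",        # example queries, not from the repo
    "pl.read_csv loads a CSV into a DataFrame.",
])
print(len(vectors), len(vectors[0]))  # 2 vectors, 384 dimensions each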
src/data_processing/process_markdown.py ADDED
@@ -0,0 +1,53 @@
+ from typing import Any
+
+ from bs4 import BeautifulSoup
+ from langchain_core.documents import Document
+ from markdown import markdown
+ from pathlib import Path
+ from langchain.text_splitter import MarkdownTextSplitter, MarkdownHeaderTextSplitter, TextSplitter
+
+ from src.utils import batched
+
+
+ def read_markdown_file(path: str | Path) -> tuple[str, str]:
+     path = Path(path)
+     with open(path, 'r', encoding="utf8") as f_r:
+         text = f_r.read()
+
+     # text = markdown(text)
+     # text = ''.join(BeautifulSoup(text).findAll(text=True))
+     return text, str(path)
+
+
+ def split_markdown(md: str | list[str],
+                    metadata: dict[str, Any] | list[dict[str, Any]],
+                    chunk_size=512,
+                    overlap=64,
+                    splitter: TextSplitter = None) -> list[Document]:
+     if isinstance(md, str):
+         md = [md]
+         if isinstance(metadata, list):
+             raise ValueError("metadata should be a single dict")
+         metadata = [metadata]
+     if splitter is None:
+         # Default pipeline: split on markdown headers first, then chunk by size.
+         headers_to_split_on = [
+             ("#", "Header 1"),
+             ("##", "Header 2"),
+             ("###", "Header 3"),
+         ]
+         md = [MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False).split_text(i) for i in md]
+         # Merge each file's metadata with the header metadata of each split.
+         metadata = [{**metadata[i], **text.metadata} for i, text_split in enumerate(md) for text in text_split]
+         md = [j.page_content for i in md for j in i]
+         splitter = MarkdownTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
+
+     docs = splitter.create_documents(md, metadata)
+     return docs
+
+
+ def process_markdown_files(paths: list[str | Path], batch_size=1, chunk_size=512, overlap=64):
+     # Yields (chunk_texts, chunk_metadatas) for each batch of markdown files.
+     for files in batched(paths, batch_size):
+         mds_w_paths = [read_markdown_file(i) for i in files]
+         metadata = [{"path": md_path} for _, md_path in mds_w_paths]
+         md = [md for md, _ in mds_w_paths]
+         docs = split_markdown(md, metadata, chunk_size=chunk_size, overlap=overlap)
+         yield [i.page_content for i in docs], [i.metadata for i in docs]
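
A quick sketch of how the generator might be driven (the docs directory path is an assumption, not part of the commit):

from pathlib import Path
from src.data_processing.process_markdown import process_markdown_files

paths = list(Path("data/polars-docu").rglob("*.md"))  # assumed location of the docs checkout
for texts, metadatas in process_markdown_files(paths, batch_size=8):
    # Each chunk carries its source path plus any markdown header metadata.
    print(len(texts), metadatas[0])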
src/data_processing/upload_to_qdrant.py ADDED
@@ -0,0 +1,61 @@
+ from typing import Any
+
+ from qdrant_client import QdrantClient, models
+ from uuid import uuid4
+
+ from src.config import QDRANT_COLLECTION_NAME
+
+
+ class QdrantStore:
+     """Thin wrapper around QdrantClient for creating collections and upserting/deleting points."""
+
+     def __init__(self, client: QdrantClient, collection_config=None):
+         self.client = client
+         self.collection_names = {i.name for i in client.get_collections().collections}
+
+         if collection_config is not None:
+             self.create_collection(collection_config)
+
+     def create_collection(self, collection_config: dict):
+         collection_name = collection_config["collection_name"]
+         if not self.client.collection_exists(collection_name):
+             self.client.create_collection(**collection_config)
+         self.collection_names.add(collection_name)
+
+     def _check_collection_name(self, collection_name):
+         if collection_name not in self.collection_names:
+             raise ValueError(f"Collection: {collection_name} does not exist.")
+
+     def upsert_points(self,
+                       vectors: Any | list[Any],
+                       payloads: dict | list[dict],
+                       collection_name: str):
+         self._check_collection_name(collection_name)
+
+         # One random UUID per point; Qdrant requires explicit point ids.
+         ids = [str(uuid4()) for _ in payloads]
+
+         self.client.upsert(
+             collection_name=collection_name,
+             points=models.Batch(
+                 ids=ids,
+                 payloads=payloads,
+                 vectors=vectors
+             )
+         )
+
+     def delete_points(self,
+                       filters: dict[str, list[models.FieldCondition]],
+                       collection_name: str):
+         self._check_collection_name(collection_name)
+
+         self.client.delete(
+             collection_name=collection_name,
+             points_selector=models.Filter(**filters)
+         )
+
+     def delete_points_by_match(self,
+                                key_value: tuple[str, list[str] | str],
+                                collection_name: str):
+         # Delete every point whose payload field `key` matches one of `values`.
+         key, values = key_value
+         if isinstance(values, str):
+             values = [values]
+         filters = {"must": [models.FieldCondition(key=key, match=models.MatchAny(any=values))]}
+         self.delete_points(filters, collection_name)
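
A minimal end-to-end indexing sketch (not part of the commit) showing how the pieces above fit together; the Qdrant URL and the docs directory are assumptions.

from pathlib import Path
from qdrant_client import QdrantClient

from src.config import QDRANT_COLLECTION_NAME, get_qdrant_config
from src.data_processing.embeddings import TextEmbedder
from src.data_processing.process_markdown import process_markdown_files
from src.data_processing.upload_to_qdrant import QdrantStore

client = QdrantClient(url="http://localhost:6333")                 # assumed local Qdrant instance
store = QdrantStore(client, collection_config=get_qdrant_config())  # creates the collection if missing
embedder = TextEmbedder()

paths = list(Path("data/polars-docu").rglob("*.md"))               # assumed docs checkout
for texts, payloads in process_markdown_files(paths, batch_size=8):
    vectors = embedder.embed_text(texts)
    store.upsert_points(vectors, payloads, QDRANT_COLLECTION_NAME)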
src/testing.py ADDED
@@ -0,0 +1,19 @@
+ from bs4 import BeautifulSoup
+ from markdown import markdown
+ from langchain.text_splitter import MarkdownTextSplitter
+
+
+ # Raw string so the Windows backslashes are not treated as escape sequences.
+ path = r"D:\PycharmProjects\polargs-docu-chat-rag\data\polars-docu\concepts\data-types-and-structures.md"
+
+ with open(path, 'r', encoding="utf8") as f_r:
+     test_md = f_r.read()
+
+ # Render the markdown to HTML, then keep only the text nodes.
+ html = markdown(test_md)
+ text = ''.join(BeautifulSoup(html, "html.parser").find_all(string=True))
+
+ print(text[:10])
+
+ splitter = MarkdownTextSplitter(chunk_size=512, chunk_overlap=64)
+
+ docs = splitter.create_documents([text])
+ print(docs)
src/utils.py ADDED
@@ -0,0 +1,12 @@
+ from itertools import islice
+
+
+ def batched(iterable, n, *, strict=False):
+     # batched('ABCDEFG', 3) → ABC DEF G
+     if n < 1:
+         raise ValueError('n must be at least one')
+     iterator = iter(iterable)
+     while batch := tuple(islice(iterator, n)):
+         if strict and len(batch) != n:
+             raise ValueError('batched(): incomplete batch')
+         yield batch
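
This mirrors the itertools.batched recipe from the Python documentation (itertools.batched itself needs Python 3.12+, and its strict flag 3.13+). A tiny example:

from src.utils import batched

print(list(batched("ABCDEFG", 3)))  # [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]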