adowu committed
Commit 4cf9a02
1 Parent(s): b380300

Update database.py

Files changed (1)
  1. database.py +93 -33
database.py CHANGED
@@ -6,19 +6,18 @@ import chromadb
 from chromadb.utils import embedding_functions
 from config import EMBEDDING_MODEL, DATABASE_DIR
 
-# Improved logging configuration
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 class KodeksProcessor:
     def __init__(self):
-        logger.info(f"Initializing database client in directory: {DATABASE_DIR}")
+        logger.info(f"Inicjalizacja klienta bazy danych w katalogu: {DATABASE_DIR}")
         if not os.path.exists(DATABASE_DIR):
             os.makedirs(DATABASE_DIR)
-            logger.info(f"Created directory {DATABASE_DIR}")
+            logger.info(f"Utworzono katalog {DATABASE_DIR}")
 
         self.client = chromadb.PersistentClient(path=DATABASE_DIR)
-        logger.info("Database client initialized")
+        logger.info("Klient bazy danych zainicjalizowany")
 
         try:
             self.collection = self.client.get_or_create_collection(
@@ -27,22 +26,62 @@ class KodeksProcessor:
                     model_name=EMBEDDING_MODEL
                 )
             )
-            logger.info("Collection 'kodeksy' retrieved or created")
+            logger.info("Kolekcja 'kodeksy' pobrana lub utworzona")
         except Exception as e:
-            logger.error(f"Error while getting or creating collection: {e}")
+            logger.error(f"Błąd podczas pobierania lub tworzenia kolekcji: {e}")
             raise
 
     def extract_metadata(self, text: str) -> Dict:
         metadata = {}
-        # ... (rest of the method remains the same)
-        logger.info("Extracted metadata: %s", metadata)
+        dz_u_match = re.search(r'Dz\.U\.(\d{4})\.(\d+)\.(\d+)', text)
+        if dz_u_match:
+            metadata['dz_u'] = f"Dz.U.{dz_u_match.group(1)}.{dz_u_match.group(2)}.{dz_u_match.group(3)}"
+            metadata['rok'] = dz_u_match.group(1)
+
+        nazwa_match = re.search(r'USTAWA\s+z dnia(.*?)\n(.*?)\n', text)
+        if nazwa_match:
+            metadata['data_ustawy'] = nazwa_match.group(1).strip()
+            metadata['nazwa'] = nazwa_match.group(2).strip()
+
+        zmiany = re.findall(r'(\d{4}-\d{2}-\d{2})\s+(zm\.\s+DZ\.U\.(\d{4})\.(\d+)\.(\d+)\s+art\.\s+(\d+)(?:\s+§\s+(\d+))?)', text)
+        if zmiany:
+            metadata['historia_zmian'] = [
+                {
+                    'data': data,
+                    'dz_u': f"Dz.U.{rok}.{numer}.{pozycja}",
+                    'artykul': artykul,
+                    'paragraf': paragraf if paragraf else None
+                }
+                for data, _, rok, numer, pozycja, artykul, paragraf in zmiany
+            ]
+
+        logger.debug(f"Wyodrębnione metadane: {metadata}")
         return metadata
 
     def split_header_and_content(self, text: str) -> Tuple[str, str]:
-        # ... (method remains the same)
+        parts = text.split("USTAWA", 1)
+        if len(parts) > 1:
+            return parts[0], "USTAWA" + parts[1]
+        return "", text
 
     def process_article(self, article_text: str) -> Dict:
-        # ... (method remains the same)
+        art_num_match = re.match(r'Art\.\s*(\d+[a-z]?)', article_text)
+        article_num = art_num_match.group(1) if art_num_match else ""
+
+        paragraphs = re.findall(r'§\s*(\d+)\.\s*(.*?)(?=§\s*\d+|Art\.\s*\d+|$)', article_text, re.DOTALL)
+
+        if not paragraphs:
+            return {
+                "article_num": article_num,
+                "content": article_text.strip(),
+                "has_paragraphs": False
+            }
+
+        return {
+            "article_num": article_num,
+            "paragraphs": paragraphs,
+            "has_paragraphs": True
+        }
 
     def split_into_chunks(self, text: str, metadata: Dict) -> List[Dict]:
         chunks = []
@@ -61,27 +100,31 @@ class KodeksProcessor:
 
         if processed_article["has_paragraphs"]:
             for par_num, par_content in processed_article["paragraphs"]:
-                chunks.append({
+                chunk = {
                     "text": f"{article_title} §{par_num}. {par_content.strip()}",
                     "metadata": {**chunk_metadata, "paragraph": par_num}
-                })
+                }
+                chunks.append(chunk)
+                logger.debug(f"Utworzono chunk: {chunk['text'][:100]}...")
         else:
-            chunks.append({
+            chunk = {
                 "text": processed_article["content"],
                 "metadata": chunk_metadata
-            })
+            }
+            chunks.append(chunk)
+            logger.debug(f"Utworzono chunk: {chunk['text'][:100]}...")
 
-        logger.info("Split text into %d chunks.", len(chunks))
+        logger.debug(f"Podzielono tekst na {len(chunks)} chunków.")
        return chunks
 
     def process_file(self, filepath: str) -> None:
-        logger.info("Processing file: %s", filepath)
+        logger.info(f"Przetwarzanie pliku: {filepath}")
 
         try:
             with open(filepath, 'r', encoding='utf-8') as file:
                 content = file.read()
         except Exception as e:
-            logger.error(f"Error reading file {filepath}: {e}")
+            logger.error(f"Błąd podczas odczytu pliku {filepath}: {e}")
             return
 
         header, main_content = self.split_header_and_content(content)
@@ -92,36 +135,51 @@ class KodeksProcessor:
 
         if chunks:
             try:
+                logger.debug(f"Próba dodania {len(chunks)} chunków do kolekcji")
                 self.collection.add(
                     documents=[chunk["text"] for chunk in chunks],
                     metadatas=[chunk["metadata"] for chunk in chunks],
                     ids=[f"{metadata['filename']}_{chunk['metadata']['article']}_{i}" for i, chunk in enumerate(chunks)]
                 )
-                logger.info(f"Added {len(chunks)} chunks from file {metadata['filename']}")
+                logger.debug("Chunki dodane pomyślnie")
+                logger.info(f"Dodano {len(chunks)} chunków z pliku {metadata['filename']}")
             except Exception as e:
-                logger.error(f"Error adding chunks to collection: {e}")
+                logger.error(f"Błąd podczas dodawania chunków do kolekcji: {e}")
         else:
-            logger.warning(f"No chunks to add from file: {filepath}")
+            logger.warning(f"Brak chunków do dodania z pliku: {filepath}")
 
     def process_all_files(self, directory: str) -> None:
-        logger.info("Starting to process all files in directory: %s", directory)
-        for filename in os.listdir(directory):
-            if filename.endswith('.txt'):
-                filepath = os.path.join(directory, filename)
-                self.process_file(filepath)
-        logger.info("Finished processing files.")
+        logger.info(f"Rozpoczęcie przetwarzania wszystkich plików w katalogu: {directory}")
+        files = [f for f in os.listdir(directory) if f.endswith('.txt')]
+        logger.info(f"Znaleziono {len(files)} plików .txt")
+        for filename in files:
+            filepath = os.path.join(directory, filename)
+            self.process_file(filepath)
+        logger.info("Zakończono przetwarzanie plików.")
+
+    def verify_data_loading(self):
+        count = self.collection.count()
+        logger.info(f"Całkowita liczba dokumentów w kolekcji: {count}")
+        if count == 0:
+            logger.warning("Nie załadowano żadnych dokumentów do bazy danych.")
+
+    def test_search(self):
+        test_queries = ["kodeks karny", "art. 1", "przestępstwo"]
+        for query in test_queries:
+            results = self.search(query)
+            logger.info(f"Zapytanie testowe '{query}' zwróciło {len(results['documents'][0])} wyników")
 
     def search(self, query: str, n_results: int = 3) -> Dict:
-        logger.info("Searching database for query: %s", query)
+        logger.info(f"Wyszukiwanie w bazie danych dla zapytania: {query}")
         try:
             results = self.collection.query(
                 query_texts=[query],
                 n_results=n_results
             )
-            logger.info("Found %d results for query: %s", len(results['documents'][0]), query)
+            logger.info(f"Znaleziono {len(results['documents'][0])} wyników dla zapytania: {query}")
             return results
         except Exception as e:
-            logger.error(f"Error during search: {e}")
+            logger.error(f"Błąd podczas wyszukiwania: {e}")
             return {"documents": [[]], "metadatas": [[]], "distances": [[]]}
 
     def list_all_documents(self) -> None:
@@ -129,13 +187,15 @@ class KodeksProcessor:
             all_docs = self.collection.get(include=['metadatas'])
            if all_docs['metadatas']:
                 for metadata in all_docs['metadatas']:
-                    logger.info("Document: %s", metadata)
+                    logger.info(f"Dokument: {metadata}")
             else:
-                logger.info("No documents in the database.")
+                logger.info("Brak dokumentów w bazie.")
         except Exception as e:
-            logger.error(f"Error listing documents: {e}")
+            logger.error(f"Błąd podczas listowania dokumentów: {e}")
 
 if __name__ == "__main__":
     processor = KodeksProcessor()
     processor.process_all_files("data/kodeksy")
+    processor.verify_data_loading()
+    processor.test_search()
     processor.list_all_documents()
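
For context, a minimal usage sketch of the updated class (not part of the commit): it assumes config.py defines EMBEDDING_MODEL and DATABASE_DIR, that data/kodeksy/ contains the source .txt files, and that the sample header below is representative of the expected input format.

from database import KodeksProcessor

processor = KodeksProcessor()

# extract_metadata() reads the Dz.U. reference and the act's date/title from a header
# like this one (Kodeks karny, Dz.U.1997.88.553):
sample_header = "Dz.U.1997.88.553\nUSTAWA\nz dnia 6 czerwca 1997 r.\nKodeks karny\n"
print(processor.extract_metadata(sample_header))
# -> {'dz_u': 'Dz.U.1997.88.553', 'rok': '1997',
#     'data_ustawy': '6 czerwca 1997 r.', 'nazwa': 'Kodeks karny'}

# Build the collection and query it; Chroma returns parallel lists per query text,
# so index 0 corresponds to the single query string passed to search().
processor.process_all_files("data/kodeksy")
results = processor.search("kodeks karny", n_results=3)
for doc, meta, dist in zip(results["documents"][0],
                           results["metadatas"][0],
                           results["distances"][0]):
    print(f"[{dist:.3f}] art. {meta.get('article', '?')}: {doc[:80]}")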