Spaces:
Running
on
Zero
Running
on
Zero
from calendar import month_name | |
from retriever import BuildRetriever, db_dir | |
import json | |
import os | |
import re | |
def get_collection(compute_mode):
    """
    Fetch the contents of the dense retriever's vectorstore.

    A retriever is built with BuildRetriever(compute_mode, "dense") and the
    result of calling .get() on its vectorstore is returned unchanged.

    Usage Examples:
        # Number of child documents
        collection = get_collection("remote")
        len(collection["ids"])
        # Number of parent documents (unique doc_ids)
        len(set([m["doc_id"] for m in collection["metadatas"]]))
    """
    return BuildRetriever(compute_mode, "dense").vectorstore.get()
def get_sources():
    """
    List the source file recorded for every entry of the BM25 corpus,
    e.g. 'R-help/2024-April.txt'.

    The corpus is the JSON Lines file <db_dir>/bm25/corpus.jsonl. Each line is
    parsed as a JSON object and its ["metadata"]["source"] value is collected.
    The result has one item per line, in file order, so repeated sources are
    kept as duplicates.
    """
    corpus_path = os.path.join(db_dir, "bm25", "corpus.jsonl")
    sources = []
    with open(corpus_path, "r", encoding="utf-8") as corpus:
        for raw_line in corpus:
            # One JSON document per line
            record = json.loads(raw_line.strip())
            sources.append(record["metadata"]["source"])
    return sources
def get_start_end_months(sources):
    """
    Given filenames like 'R-help/2024-January.txt', return the earliest and
    latest month in 'Month YYYY' format.

    Args:
        sources: Iterable of source filenames. Duplicates are ignored, and so
            are entries that do not start with 'R-help/<YYYY>-<Month>.txt' or
            whose month is not a name found in calendar.month_name.

    Returns:
        A (start, end) tuple of strings such as ('December 2023', 'April 2024'),
        or (None, None) when no source yields a recognizable year and month.

    NOTE: calendar.month_name follows the current locale, so full English
    month names are only recognized while the locale's month names are English.
    """
    pattern = re.compile(r"R-help/(\d{4})-([A-Za-z]+)\.txt")
    # Build the name -> number table once instead of re-listing and scanning
    # month_name for every source. Index 0 is an empty placeholder; the regex
    # can never capture an empty month, so it is simply left out.
    month_numbers = {
        name: number for number, name in enumerate(list(month_name)) if name
    }

    months = []
    for src in set(sources):
        m = pattern.match(src)
        if m is None:
            continue
        month_str = m.group(2)
        month_num = month_numbers.get(month_str)
        if month_num is None:
            # Not a month name (e.g. an abbreviation or a typo)
            continue
        months.append((int(m.group(1)), month_num, month_str))

    if not months:
        return None, None

    # Tuples compare by (year, month number) first, so min/max give the
    # chronological extremes without sorting the whole list.
    start = min(months)
    end = max(months)
    return f"{start[2]} {start[0]}", f"{end[2]} {end[0]}"