isayahc committed on
Commit b11b693
2 Parent(s): 7b1a83a d41780e

Merge pull request #11 from almutareb/sqlite_for_sources

app.py CHANGED
@@ -3,7 +3,10 @@ from hf_mixtral_agent import agent_executor
 from innovation_pathfinder_ai.source_container.container import (
     all_sources
 )
-from innovation_pathfinder_ai.utils import collect_urls
+from innovation_pathfinder_ai.utils.utils import extract_urls
+from innovation_pathfinder_ai.utils import logger
+
+logger = logger.get_console_logger("app")

 if __name__ == "__main__":

@@ -13,7 +16,7 @@ if __name__ == "__main__":

     def bot(history):
         response = infer(history[-1][0], history)
-        sources = collect_urls(all_sources)
+        sources = extract_urls(all_sources)
         src_list = '\n'.join(sources)
         response_w_sources = response['output']+"\n\n\n Sources: \n\n\n"+src_list
         history[-1][1] = response_w_sources
hf_mixtral_agent.py CHANGED
@@ -1,15 +1,9 @@
 # HF libraries
 from langchain_community.llms import HuggingFaceEndpoint
-from langchain_core.prompts import ChatPromptTemplate
-from langchain import hub
-import gradio as gr
 from langchain.agents import AgentExecutor
 from langchain.agents.format_scratchpad import format_log_to_str
-from langchain.agents.output_parsers import (
-    ReActJsonSingleInputOutputParser,
-)
+from langchain.agents.output_parsers import ReActJsonSingleInputOutputParser
 # Import things that are needed generically
-from typing import List, Dict
 from langchain.tools.render import render_text_description
 import os
 from dotenv import load_dotenv
@@ -17,12 +11,11 @@ from innovation_pathfinder_ai.structured_tools.structured_tools import (
     arxiv_search, get_arxiv_paper, google_search, wikipedia_search
 )

-# hacky and should be replaced with a database
-from innovation_pathfinder_ai.source_container.container import (
-    all_sources
-)
 from langchain import PromptTemplate
 from innovation_pathfinder_ai.templates.react_json_with_memory import template_system
+from innovation_pathfinder_ai.utils import logger
+
+logger = logger.get_console_logger("hf_mixtral_agent")

 config = load_dotenv(".env")
 HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
@@ -49,13 +42,6 @@ tools = [
     # get_arxiv_paper,
 ]

-tools_papers = [
-    arxiv_search,
-    get_arxiv_paper,
-
-]
-
-
 prompt = PromptTemplate.from_template(
     template=template_system
 )
@@ -87,15 +73,4 @@ agent_executor = AgentExecutor(
     #max_execution_time=60, # timout at 60 sec
     return_intermediate_steps=True,
     handle_parsing_errors=True,
-)
-
-# instantiate AgentExecutor
-agent_executor_noweb = AgentExecutor(
-    agent=agent,
-    tools=tools_papers,
-    verbose=True,
-    max_iterations=6, # cap number of iterations
-    #max_execution_time=60, # timout at 60 sec
-    return_intermediate_steps=True,
-    handle_parsing_errors=True,
 )
innovation_pathfinder_ai/database/db_handler.py ADDED
@@ -0,0 +1,109 @@
+from sqlmodel import SQLModel, create_engine, Session, select
+from innovation_pathfinder_ai.database.schema import Sources
+from innovation_pathfinder_ai.utils.logger import get_console_logger
+
+sqlite_file_name = "innovation_pathfinder_ai/database/database.sqlite3"
+sqlite_url = f"sqlite:///{sqlite_file_name}"
+engine = create_engine(sqlite_url, echo=False)
+
+logger = get_console_logger("db_handler")
+
+SQLModel.metadata.create_all(engine)
+
+
+def read_one(hash_id: dict):
+    with Session(engine) as session:
+        statement = select(Sources).where(Sources.hash_id == hash_id)
+        sources = session.exec(statement).first()
+        return sources
+
+
+def add_one(data: dict):
+    with Session(engine) as session:
+        if session.exec(
+            select(Sources).where(Sources.hash_id == data.get("hash_id"))
+        ).first():
+            logger.warning(f"Item with hash_id {data.get('hash_id')} already exists")
+            return None  # or raise an exception, or handle as needed
+        sources = Sources(**data)
+        session.add(sources)
+        session.commit()
+        session.refresh(sources)
+        logger.info(f"Item with hash_id {data.get('hash_id')} added to the database")
+        return sources
+
+
+def update_one(hash_id: dict, data: dict):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == hash_id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {hash_id} found for update")
+            return None  # or raise an exception, or handle as needed
+        for key, value in data.items():
+            setattr(sources, key, value)
+        session.commit()
+        logger.info(f"Item with hash_id {hash_id} updated in the database")
+        return sources
+
+
+def delete_one(id: int):
+    with Session(engine) as session:
+        # Check if the item with the given hash_id exists
+        sources = session.exec(
+            select(Sources).where(Sources.hash_id == id)
+        ).first()
+        if not sources:
+            logger.warning(f"No item with hash_id {id} found for deletion")
+            return None  # or raise an exception, or handle as needed
+        session.delete(sources)
+        session.commit()
+        logger.info(f"Item with hash_id {id} deleted from the database")
+
+
+def add_many(data: list):
+    with Session(engine) as session:
+        for info in data:
+            # Reuse add_one function for each item
+            result = add_one(info)
+            if result is None:
+                logger.warning(
+                    f"Item with hash_id {info.get('hash_id')} could not be added"
+                )
+            else:
+                logger.info(
+                    f"Item with hash_id {info.get('hash_id')} added to the database"
+                )
+        session.commit()  # Commit at the end of the loop
+
+
+def delete_many(ids: list):
+    with Session(engine) as session:
+        for id in ids:
+            # Reuse delete_one function for each item
+            result = delete_one(id)
+            if result is None:
+                logger.warning(f"No item with hash_id {id} found for deletion")
+            else:
+                logger.info(f"Item with hash_id {id} deleted from the database")
+        session.commit()  # Commit at the end of the loop
+
+
+def read_all(query: dict = None):
+    with Session(engine) as session:
+        statement = select(Sources)
+        if query:
+            statement = statement.where(
+                *[getattr(Sources, key) == value for key, value in query.items()]
+            )
+        sources = session.exec(statement).all()
+        return sources
+
+
+def delete_all():
+    with Session(engine) as session:
+        session.exec(Sources).delete()
+        session.commit()
+        logger.info("All items deleted from the database")
innovation_pathfinder_ai/database/schema.py ADDED
@@ -0,0 +1,15 @@
+from sqlmodel import SQLModel, Field
+from typing import Optional
+
+import datetime
+
+class Sources(SQLModel, table=True):
+    id: Optional[int] = Field(default=None, primary_key=True)
+    url: str = Field()
+    title: Optional[str] = Field(default="NA", unique=False)
+    hash_id: str = Field(unique=True)
+    created_at: float = Field(default=datetime.datetime.now().timestamp())
+    summary: str = Field(default="")
+    embedded: bool = Field(default=False)
+
+    __table_args__ = {"extend_existing": True}
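As a quick illustration (a sketch, not part of the commit), a Sources row can be constructed directly, with unset fields falling back to the defaults declared above; the URL and title are invented example values.

import hashlib

from innovation_pathfinder_ai.database.schema import Sources

url = "https://en.wikipedia.org/wiki/Database"  # placeholder
row = Sources(
    url=url,
    title="Database",
    hash_id=hashlib.md5(url.encode()).hexdigest(),
)
print(row.summary, row.embedded)  # "" False, taken from the model defaults

Note that the created_at default is evaluated once at class-definition time, so rows created later in the same process inherit that import-time timestamp unless created_at is set explicitly.
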
innovation_pathfinder_ai/structured_tools/structured_tools.py CHANGED
@@ -6,31 +6,32 @@ from langchain_community.utilities import WikipediaAPIWrapper
 #from langchain.tools import Tool
 from langchain_community.utilities import GoogleSearchAPIWrapper
 import arxiv
-
+import ast
 # hacky and should be replaced with a database
 from innovation_pathfinder_ai.source_container.container import (
     all_sources
 )
-from innovation_pathfinder_ai.utils import create_wikipedia_urls_from_text
+from innovation_pathfinder_ai.utils.utils import (
+    parse_list_to_dicts, format_wiki_summaries, format_arxiv_documents, format_search_results
+)
+from innovation_pathfinder_ai.database.db_handler import (
+    add_many
+)

 @tool
 def arxiv_search(query: str) -> str:
     """Search arxiv database for scientific research papers and studies. This is your primary information source.
     always check it first when you search for information, before using any other tool."""
-    # return "LangChain"
     global all_sources
-    arxiv_retriever = ArxivRetriever(load_max_docs=2)
+    arxiv_retriever = ArxivRetriever(load_max_docs=3)
     data = arxiv_retriever.invoke(query)
     meta_data = [i.metadata for i in data]
-    # meta_data += all_sources
-    # all_sources += meta_data
-    all_sources += meta_data
-
-    # formatted_info = format_info(entry_id, published, title, authors)
-
-    # formatted_info = format_info_list(all_sources)
-
-    return meta_data.__str__()
+    formatted_sources = format_arxiv_documents(data)
+    all_sources += formatted_sources
+    parsed_sources = parse_list_to_dicts(formatted_sources)
+    add_many(parsed_sources)
+
+    return data.__str__()

 @tool
 def get_arxiv_paper(paper_id:str) -> None:
@@ -52,17 +53,13 @@ def get_arxiv_paper(paper_id:str) -> None:
 @tool
 def google_search(query: str) -> str:
     """Search Google for additional results when you can't answer questions using arxiv search or wikipedia search."""
-    # return "LangChain"
     global all_sources

     websearch = GoogleSearchAPIWrapper()
-    search_results:dict = websearch.results(query, 5)
-
-
-    #organic_source = search_results['organic_results']
-    # formatted_string = "Title: {title}, link: {link}, snippet: {snippet}".format(**organic_source)
-    cleaner_sources = ["Title: {title}, link: {link}, snippet: {snippet}".format(**i) for i in search_results]
-
+    search_results:dict = websearch.results(query, 3)
+    cleaner_sources =format_search_results(search_results)
+    parsed_csources = parse_list_to_dicts(cleaner_sources)
+    add_many(parsed_csources)
     all_sources += cleaner_sources

     return cleaner_sources.__str__()
@@ -75,5 +72,9 @@ def wikipedia_search(query: str) -> str:
     api_wrapper = WikipediaAPIWrapper()
     wikipedia_search = WikipediaQueryRun(api_wrapper=api_wrapper)
     wikipedia_results = wikipedia_search.run(query)
-    all_sources += create_wikipedia_urls_from_text(wikipedia_results)
-    return wikipedia_results
+    formatted_summaries = format_wiki_summaries(wikipedia_results)
+    all_sources += formatted_summaries
+    parsed_summaries = parse_list_to_dicts(formatted_summaries)
+    add_many(parsed_summaries)
+
+    return wikipedia_results.__str__()
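To make the new source-tracking flow concrete, here is a hedged sketch (not part of the commit) of what each tool now does with its results: a format_* helper produces "Title: ..., Link: ..., Summary: ..." strings, parse_list_to_dicts converts them into Sources-shaped dicts, and add_many persists them. The sample search result below is invented; its keys match the shape returned by GoogleSearchAPIWrapper.results, and the SQLite path is assumed to resolve from the repository root.

from innovation_pathfinder_ai.utils.utils import format_search_results, parse_list_to_dicts
from innovation_pathfinder_ai.database.db_handler import add_many

search_results = [  # invented sample in the shape of websearch.results(query, 3)
    {"title": "Example page", "link": "https://example.org/page", "snippet": "Short snippet."},
]

formatted = format_search_results(search_results)  # "Title: ..., Link: ..., Summary: ..." strings
parsed = parse_list_to_dicts(formatted)            # {"url", "title", "hash_id", "summary"} dicts
add_many(parsed)                                   # deduplicated on hash_id by db_handler.add_one
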
innovation_pathfinder_ai/utils.py DELETED
@@ -1,42 +0,0 @@
-def create_wikipedia_urls_from_text(text):
-    """
-    Extracts page titles from a given text and constructs Wikipedia URLs for each title.
-
-    Args:
-    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.
-
-    Returns:
-    - list: A list of Wikipedia URLs constructed from the extracted titles.
-    """
-    # Split the text into sections based on "Page:" prefix
-    sections = text.split("Page: ")
-    # Remove the first item if it's empty (in case the text starts with "Page:")
-    if sections[0].strip() == "":
-        sections = sections[1:]
-
-    urls = []  # Initialize an empty list to store the URLs
-    for section in sections:
-        # Extract the title, which is the string up to the first newline
-        title = section.split("\n", 1)[0]
-        # Replace spaces with underscores for the URL
-        url_title = title.replace(" ", "_")
-        # Construct the URL and add it to the list
-        url = f"https://en.wikipedia.org/wiki/{url_title}"
-        urls.append(url)
-
-    return urls
-
-def collect_urls(data_list):
-    urls = []
-    for item in data_list:
-        # Check if item is a string and contains 'link:'
-        if isinstance(item, str) and 'link:' in item:
-            start = item.find('link:') + len('link: ')
-            end = item.find(',', start)
-            url = item[start:end if end != -1 else None].strip()
-            urls.append(url)
-        # Check if item is a dictionary and has 'Entry ID'
-        elif isinstance(item, dict) and 'Entry ID' in item:
-            urls.append(item['Entry ID'])
-    last_sources = urls[-3:]
-    return last_sources
innovation_pathfinder_ai/utils/logger.py ADDED
@@ -0,0 +1,20 @@
+# logger.py
+
+import logging
+from rich.logging import RichHandler
+from typing import Optional
+
+
+def get_console_logger(name: Optional[str] = "default") -> logging.Logger:
+    logger = logging.getLogger(name)
+    if not logger.handlers:
+        logger.setLevel(logging.DEBUG)
+        console_handler = RichHandler()
+        console_handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+        )
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    return logger
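Usage throughout this PR follows the same pattern; a minimal sketch (assuming the rich package from requirements.txt is installed):

from innovation_pathfinder_ai.utils.logger import get_console_logger

logger = get_console_logger("example")
logger.info("Console logging goes through RichHandler")

The "if not logger.handlers" guard means repeated calls with the same name reuse the existing handler instead of attaching duplicates.
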
innovation_pathfinder_ai/utils/utils.py ADDED
@@ -0,0 +1,171 @@
+import hashlib
+import datetime
+
+from innovation_pathfinder_ai.utils import logger
+
+logger = logger.get_console_logger("utils")
+
+def create_wikipedia_urls_from_text(text):
+    """
+    Extracts page titles from a given text and constructs Wikipedia URLs for each title.
+
+    Args:
+    - text (str): A string containing multiple sections, each starting with "Page:" followed by the title.
+
+    Returns:
+    - list: A list of Wikipedia URLs constructed from the extracted titles.
+    """
+    # Split the text into sections based on "Page:" prefix
+    sections = text.split("Page: ")
+    # Remove the first item if it's empty (in case the text starts with "Page:")
+    if sections[0].strip() == "":
+        sections = sections[1:]
+
+    urls = []  # Initialize an empty list to store the URLs
+    for section in sections:
+        # Extract the title, which is the string up to the first newline
+        title = section.split("\n", 1)[0]
+        # Replace spaces with underscores for the URL
+        url_title = title.replace(" ", "_")
+        # Construct the URL and add it to the list
+        url = f"https://en.wikipedia.org/wiki/{url_title}"
+        urls.append(url)
+    print(urls)
+
+    return urls
+
+def extract_urls(data_list):
+    """
+    Extracts URLs from a list of of dictionaries.
+
+    Parameters:
+    - formatted_list (list): A list of dictionaries, each containing 'Title:', 'link:', and 'summary:'.
+
+    Returns:
+    - list: A list of URLs extracted from the dictionaries.
+    """
+    urls = []
+    print(data_list)
+    for item in data_list:
+        try:
+            # Find the start and end indices of the URL
+            lower_case = item.lower()
+            link_prefix = 'link: '
+            summary_prefix = ', summary:'
+            start_idx = lower_case.index(link_prefix) + len(link_prefix)
+            end_idx = lower_case.index(summary_prefix, start_idx)
+            # Extract the URL using the indices found
+            url = item[start_idx:end_idx]
+            urls.append(url)
+        except ValueError:
+            # Handles the case where 'link: ' or ', summary:' is not found in the string
+            print("Could not find a URL in the item:", item)
+    last_sources = urls[-3:]
+    return last_sources
+
+def format_wiki_summaries(input_text):
+    """
+    Parses a given text containing page titles and summaries, formats them into a list of strings,
+    and appends Wikipedia URLs based on titles.
+
+    Parameters:
+    - input_text (str): A string containing titles and summaries separated by specific markers.
+
+    Returns:
+    - list: A list of formatted strings with titles, summaries, and Wikipedia URLs.
+    """
+    # Splitting the input text into individual records based on double newlines
+    records = input_text.split("\n\n")
+
+    formatted_records_with_urls = []
+    for record in records:
+        if "Page:" in record and "Summary:" in record:
+            title_line, summary_line = record.split("\n", 1)  # Splitting only on the first newline
+            title = title_line.replace("Page: ", "").strip()
+            summary = summary_line.replace("Summary: ", "").strip()
+            # Replace spaces with underscores for the URL and construct the Wikipedia URL
+            url_title = title.replace(" ", "_")
+            wikipedia_url = f"https://en.wikipedia.org/wiki/{url_title}"
+            # Append formatted string with title, summary, and URL
+            formatted_record = "Title: {title}, Link: {wikipedia_url}, Summary: {summary}".format(
+                title=title, summary=summary, wikipedia_url=wikipedia_url)
+            formatted_records_with_urls.append(formatted_record)
+        else:
+            print("Record format error, skipping record:", record)
+
+    return formatted_records_with_urls
+
+def format_arxiv_documents(documents):
+    """
+    Formats a list of document objects into a list of strings.
+    Each document object is assumed to have a 'metadata' dictionary with 'Title' and 'Entry ID',
+    and a 'page_content' attribute for content.
+
+    Parameters:
+    - documents (list): A list of document objects.
+
+    Returns:
+    - list: A list of formatted strings with titles, links, and content snippets.
+    """
+    formatted_documents = [
+        "Title: {title}, Link: {link}, Summary: {snippet}".format(
+            title=doc.metadata['Title'],
+            link=doc.metadata['Entry ID'],
+            snippet=doc.page_content  # Adjust the snippet length as needed
+        )
+        for doc in documents
+    ]
+    return formatted_documents
+
+def format_search_results(search_results):
+    """
+    Formats a list of dictionaries containing search results into a list of strings.
+    Each dictionary is expected to have the keys 'title', 'link', and 'snippet'.
+
+    Parameters:
+    - search_results (list): A list of dictionaries, each containing 'title', 'link', and 'snippet'.
+
+    Returns:
+    - list: A list of formatted strings based on the search results.
+    """
+    formatted_results = [
+        "Title: {title}, Link: {link}, Summary: {snippet}".format(**i)
+        for i in search_results
+    ]
+    return formatted_results
+
+def parse_list_to_dicts(items: list) -> list:
+    parsed_items = []
+    for item in items:
+        # Extract title, link, and summary from each string
+        title_start = item.find('Title: ') + len('Title: ')
+        link_start = item.find('Link: ') + len('Link: ')
+        summary_start = item.find('Summary: ') + len('Summary: ')
+
+        title_end = item.find(', Link: ')
+        link_end = item.find(', Summary: ')
+        summary_end = len(item)
+
+        title = item[title_start:title_end]
+        link = item[link_start:link_end]
+        summary = item[summary_start:summary_end]
+
+        # Use the hash_text function for the hash_id
+        hash_id = hash_text(link)
+
+        # Construct the dictionary for each item
+        parsed_item = {
+            "url": link,
+            "title": title,
+            "hash_id": hash_id,
+            "summary": summary
+        }
+        parsed_items.append(parsed_item)
+    return parsed_items
+
+def hash_text(text: str) -> str:
+    return hashlib.md5(text.encode()).hexdigest()
+
+
+def convert_timestamp_to_datetime(timestamp: str) -> str:
+    return datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d %H:%M:%S")
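A small round trip through the new helpers (a sketch with invented sample data, not part of the commit) shows how the pieces fit together:

from innovation_pathfinder_ai.utils.utils import (
    format_search_results, parse_list_to_dicts, extract_urls, hash_text
)

sample = [{"title": "Example", "link": "https://example.org", "snippet": "A snippet."}]

formatted = format_search_results(sample)
# -> ['Title: Example, Link: https://example.org, Summary: A snippet.']

print(extract_urls(formatted))           # ['https://example.org'] (at most the last three URLs)
print(parse_list_to_dicts(formatted))    # [{'url': ..., 'title': 'Example', 'hash_id': <md5 of link>, 'summary': 'A snippet.'}]
print(hash_text("https://example.org"))  # deterministic md5 hex digest
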
requirements.txt CHANGED
@@ -8,4 +8,6 @@ wikipedia
 gradio==3.48.0
 chromadb
 google_api_python_client
-pypdf2
+pypdf2
+sqlmodel
+rich