theRealNG committed on
Commit
3307cbd
·
1 Parent(s): 24cc3f2

Refactor: Moved to chains instead of crew

Browse files
crew/research_article_suggester.py CHANGED
@@ -1,20 +1,25 @@
1
  from crewai import Agent, Task, Crew
2
  from langchain_openai import ChatOpenAI
3
  from tavily import TavilyClient
 
4
  import os
5
  import json
6
  from pydantic import BaseModel, Field
7
  from crewai.tasks.task_output import TaskOutput
8
  from datetime import datetime, timedelta
 
 
 
9
 
10
- from tools.scrape_website import scrape_tool
11
 
12
- MAX_RESULTS = 5
13
  AGE_OF_RESEARCH_PAPER = 60
14
 
 
15
  class RecentArticleSuggester:
16
  """
17
- Suggests recent research articles based on a given topic.
18
  """
19
 
20
  def __init__(self):
@@ -27,24 +32,115 @@ class RecentArticleSuggester:
27
 
28
  def _suggest_research_papers(self):
29
  query = f"research papers on {self.topic} published in the last week"
30
- results = self.tavily_client.search(query, max_results=MAX_RESULTS)['results']
31
- print("Search Results: ", results)
32
- pitch_crew = self._create_pitch_crew()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  research_paper_suggestions = []
34
  for result in results:
35
  try:
36
- info = pitch_crew.kickoff(inputs={
37
- "title": result["title"],
38
- "url": result["url"],
39
- "content": result["content"]
40
- })
41
- research_paper_suggestions = research_paper_suggestions + \
42
- [info]
 
 
43
  except BaseException as e:
44
- print(f"Error processing article '{result['title']}': {e}")
 
45
 
46
  return research_paper_suggestions
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def _create_pitch_crew(self):
49
  information_gatherer = Agent(
50
  role="Research Paper Information Retriever",
@@ -64,12 +160,12 @@ class RecentArticleSuggester:
64
  date_obj = datetime.strptime(
65
  article_info['published_on'], "%d/%m/%Y")
66
 
67
- # Calculate the date that was 14 days ago from today
68
  start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
69
 
70
  # Compare if the input date is older
71
  if date_obj < start_date:
72
- raise BaseException(f"{date_obj} Older than given timeframe {start_date}")
 
73
 
74
  except ValueError:
75
  print("Invalid date format. Please use dd/mm/yyyy.")
 
1
  from crewai import Agent, Task, Crew
2
  from langchain_openai import ChatOpenAI
3
  from tavily import TavilyClient
4
+ import arxiv
5
  import os
6
  import json
7
  from pydantic import BaseModel, Field
8
  from crewai.tasks.task_output import TaskOutput
9
  from datetime import datetime, timedelta
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
12
+ from langchain_core.output_parsers import JsonOutputParser
13
 
14
+ from tools.scrape_website import scrape_tool, CustomScrapeWebsiteTool
15
 
16
+ MAX_RESULTS = 2
17
  AGE_OF_RESEARCH_PAPER = 60
18
 
19
+
20
  class RecentArticleSuggester:
21
  """
22
+ Suggests recent research papers based on a given topic.
23
  """
24
 
25
  def __init__(self):
 
32
 
33
  def _suggest_research_papers(self):
34
  query = f"research papers on {self.topic} published in the last week"
35
+ results = []
36
+ print("\nSearching for papers on Tavily...")
37
+ results = self.tavily_client.search(
38
+ query, max_results=MAX_RESULTS)['results']
39
+
40
+ print("\nSearching for papers on Arxiv...")
41
+ arxiv_results = arxiv.Search(
42
+ query=self.topic,
43
+ max_results=MAX_RESULTS,
44
+ sort_by=arxiv.SortCriterion.SubmittedDate
45
+ )
46
+ for result in arxiv_results.results():
47
+ paper = {
48
+ "title": result.title,
49
+ "authors": ", ".join(str(author) for author in result.authors),
50
+ "content": result.summary,
51
+ # "published_on": result.submitted.date(),
52
+ "url": result.entry_id,
53
+ "pdf_url": result.pdf_url
54
+ }
55
+ results.append(paper)
56
+
57
+ # pitch_crew = self._create_pitch_crew()
58
  research_paper_suggestions = []
59
  for result in results:
60
  try:
61
+ info = self._article_pitch(result)
62
+ # info = pitch_crew.kickoff(inputs={
63
+ # "title": result["title"],
64
+ # "url": result["url"],
65
+ # "content": result["content"]
66
+ # })
67
+ if info is not None:
68
+ research_paper_suggestions = research_paper_suggestions + \
69
+ [info]
70
  except BaseException as e:
71
+ print(
72
+ f"Error processing article '{result['title']}': {e}\n\n {e.__traceback__}")
73
 
74
  return research_paper_suggestions
75
 
76
def _gather_information(self, article):
    """Scrape the article page and ask the LLM for its structured details.

    Args:
        article: dict with ``title``, ``url`` and ``content`` keys, as
            built by the search step.

    Returns:
        The parsed JSON produced by the chain (expected to follow the
        ``ResearchPaper`` schema -- defined elsewhere in this file), with
        the scraped page text added under the ``article_content`` key.
    """
    print(f"\nScraping website: {article['url']}")
    article_content = CustomScrapeWebsiteTool(article["url"])

    print(f"\nGathering information from website: {article['url']}")
    parser = JsonOutputParser(pydantic_object=ResearchPaper)

    # Adjacent string literals are concatenated verbatim, so every piece
    # ends with an explicit separator; otherwise sentences run together.
    system_prompt = (
        "You are Research Paper Information Retriever. You are an expert in gathering required details about the given research paper.\n"
        "Your personal goal is: Retrieve the author information and date the research paper was published in the format of dd/mm/yyyy.\n"
        f"Formatting Instructions: {parser.get_format_instructions()}"
    )
    human_prompt = (
        f"Here is the information about the research paper title: {article['title']}, url: {article['url']}, "
        f"summary: \n{article['content']}.\n\n Research Paper content:\n{article_content}"
    )

    # The messages are already fully formatted, so the template takes no
    # input variables and the chain is invoked with an empty dict.
    prompt_template = ChatPromptTemplate.from_messages([
        SystemMessage(system_prompt),
        HumanMessage(human_prompt),
    ])
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    information_scrapper_chain = prompt_template | llm | parser

    article_info = information_scrapper_chain.invoke({})
    print("\nGathered Article Info: ", article_info)
    # Keep the scraped text so the pitch step does not have to re-fetch it.
    article_info['article_content'] = article_content
    return article_info
100
+
101
def _article_pitch(self, article):
    """Build a reader-facing pitch for one paper, or reject it as too old.

    Args:
        article: dict with ``title``, ``url`` and ``content`` keys.

    Returns:
        The parsed JSON produced by the pitch chain (expected to follow
        the ``ResearchPaperWithPitch`` schema -- defined elsewhere in this
        file), or ``None`` when the publication date is missing, is not in
        dd/mm/yyyy form, or is more than ``AGE_OF_RESEARCH_PAPER`` days old.
    """
    article_info = self._gather_information(article)

    # The LLM may omit the date or return null; treat both like an
    # unparsable date instead of letting KeyError/TypeError escape.
    try:
        date_obj = datetime.strptime(
            article_info.get('published_on'), "%d/%m/%Y")
    except (TypeError, ValueError):
        print("Invalid date format. Please use dd/mm/yyyy.")
        return None

    now = datetime.now()
    start_date = now - timedelta(days=AGE_OF_RESEARCH_PAPER)

    # Reject papers published before the accepted window.
    if date_obj < start_date:
        print(
            f"\nRejecting research paper {article['title']} because it was published on {date_obj},"
            f" which is before the expected timeframe {start_date} & {now}")
        return None

    print(f"\nCreating pitch for the research paper: {article['title']}")
    pitch_parser = JsonOutputParser(pydantic_object=ResearchPaperWithPitch)

    # Adjacent string literals are concatenated verbatim, so every piece
    # ends with an explicit separator; otherwise sentences run together.
    system_prompt = (
        "You are Curiosity Catalyst. As a Curiosity Catalyst, you know exactly how to pique the user's curiosity to read the research paper.\n"
        "Your personal goal is: To pique the user's curiosity to read the research paper.\n"
        "Read the Research Paper Content to create a pitch.\n"
        f"Formatting Instructions: {pitch_parser.get_format_instructions()}"
    )
    human_prompt = (
        f"Here is the information about the research paper title: {article_info['title']}, url: {article_info['url']}, "
        f"published_on: {article_info['published_on']}, authors: {article_info['author']}, "
        f"summary: \n{article_info['summary']}.\n\n Research Paper content:\n{article_info['article_content']}"
    )

    # The messages are already fully formatted, so the chain is invoked
    # with an empty input dict.
    pitch_template = ChatPromptTemplate.from_messages([
        SystemMessage(system_prompt),
        HumanMessage(human_prompt),
    ])
    pitch_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.2)
    pitcher_chain = pitch_template | pitch_llm | pitch_parser

    article_pitch = pitcher_chain.invoke({})
    print("\nResearch Paper with the pitch: ", article_pitch)

    return article_pitch
142
+
143
+ # Deprecated
144
  def _create_pitch_crew(self):
145
  information_gatherer = Agent(
146
  role="Research Paper Information Retriever",
 
160
  date_obj = datetime.strptime(
161
  article_info['published_on'], "%d/%m/%Y")
162
 
 
163
  start_date = datetime.now() - timedelta(days=AGE_OF_RESEARCH_PAPER)
164
 
165
  # Compare if the input date is older
166
  if date_obj < start_date:
167
+ raise BaseException(
168
+ f"{date_obj} Older than given timeframe {start_date}")
169
 
170
  except ValueError:
171
  print("Invalid date format. Please use dd/mm/yyyy.")
requirements.txt CHANGED
@@ -6,3 +6,4 @@ langchain_google_genai
6
  langchain_openai
7
  streamlit
8
  tavily-python
 
 
6
  langchain_openai
7
  streamlit
8
  tavily-python
9
+ arxiv
test.py CHANGED
@@ -2,4 +2,4 @@ from crew.research_article_suggester import RecentArticleSuggester
2
 
3
  suggester = RecentArticleSuggester()
4
  results = suggester.kickoff(inputs={"topic": "GenAI"})
5
- print(results)
 
2
 
3
  suggester = RecentArticleSuggester()
4
  results = suggester.kickoff(inputs={"topic": "GenAI"})
5
+ print("\nFinal Results: \n\n", results)
tools/scrape_website.py CHANGED
@@ -1,3 +1,14 @@
1
  from crewai_tools import ScrapeWebsiteTool
 
 
2
 
3
  scrape_tool = ScrapeWebsiteTool()
 
 
 
 
 
 
 
 
 
 
1
  from crewai_tools import ScrapeWebsiteTool
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
 
5
  scrape_tool = ScrapeWebsiteTool()
6
+
7
def CustomScrapeWebsiteTool(url, timeout=30):
    """Fetch *url* and return the page's text with extra whitespace removed.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        str: text extracted by BeautifulSoup, with blank lines dropped
        and runs of spaces collapsed to a single space.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response (so error pages are not scraped as content).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    parsed = BeautifulSoup(response.content, "html.parser")
    text = parsed.get_text()

    # Drop empty / whitespace-only lines.
    text = '\n'.join(line for line in text.split('\n') if line.strip())
    # Drop empty / whitespace-only chunks between spaces, which collapses
    # consecutive spaces into one.
    text = ' '.join(chunk for chunk in text.split(' ') if chunk.strip())

    return text