jobanpreet123 committed
Commit 11dec1a
Parent: 2f84696

scrapping code changed
__pycache__/advance_post.cpython-310.pyc ADDED
Binary file (3.46 kB)

__pycache__/paraphrase_post.cpython-310.pyc ADDED
Binary file (2.96 kB)

__pycache__/scrap_post.cpython-310.pyc ADDED
Binary file (717 Bytes)
advance_post.py CHANGED
@@ -10,7 +10,7 @@ import nest_asyncio
 def google_search(linkedin_post,model , google_api_key, search_engine_id , num_results_per_query=[3,2,1]):
 
     response_schemas = [
-    ResponseSchema(name="answer", description="These are the top three relevant questions from the LinkedIn post" , type="list")]
+    ResponseSchema(name="questions", description="These are the top three relevant questions from the LinkedIn post" , type="list")]
     output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
     format_instructions = output_parser.get_format_instructions()
 
@@ -29,7 +29,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
 
     chain = prompt | model | output_parser
     result=chain.invoke({"post": linkedin_post})
-    questions=result['answer']
+    questions=result['questions']
     # print(questions)
 
     all_links = []
@@ -61,7 +61,7 @@ def google_search(linkedin_post,model , google_api_key, search_engine_id , num_r
     # result=chain.invoke({'post':linkedinpost , 'content':docs})
     # return result , docs
 
-
+nest_asyncio.apply()
 def advanced_post(all_links ,model ,linkedinpost):
     loader = WebBaseLoader(all_links,encoding="utf-8")
     loader.requests_per_second = 1
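
The "answer" → "questions" hunks are one logical rename: StructuredOutputParser returns a dict keyed by each ResponseSchema's name, so the schema name and the lookup key must stay in sync. A minimal sketch of that contract (the JSON payload below is an illustrative assumption, not output from this app):

    from langchain.output_parsers import ResponseSchema, StructuredOutputParser

    response_schemas = [
        ResponseSchema(name="questions",
                       description="These are the top three relevant questions from the LinkedIn post",
                       type="list"),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The parser keys its output dict by the schema's `name`:
    parsed = output_parser.parse('```json\n{"questions": ["q1", "q2", "q3"]}\n```')
    assert parsed["questions"] == ["q1", "q2", "q3"]

The new module-level nest_asyncio.apply() is the documented companion to WebBaseLoader's concurrent fetching (which is what loader.requests_per_second throttles): it patches the event loop so the loader's asyncio-based requests can run inside an environment that already owns a loop, e.g. Jupyter or Streamlit, instead of failing with "RuntimeError: This event loop is already running".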
app.py CHANGED
@@ -3,7 +3,7 @@ import re
 import openai
 from paraphrase_post import get_original_url , paraphrased_post
 from advance_post import google_search , advanced_post
-from langchain.chat_models import ChatOpenAI
+from langchain_community.chat_models import ChatOpenAI
 from langchain_groq import ChatGroq
 #from langchain import HuggingFaceHub
 
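The import fix tracks LangChain's 0.1 package split, which moved third-party integrations out of the core package: langchain.chat_models.ChatOpenAI now lives in langchain_community.chat_models (with the langchain-openai package as its longer-term home). A hedged sketch of the drop-in change; the model name is an assumption for illustration:

    # from langchain.chat_models import ChatOpenAI   # pre-0.1 path, deprecated
    from langchain_community.chat_models import ChatOpenAI

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)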
paraphrase_post.py CHANGED
@@ -1,4 +1,3 @@
-from langchain_community.document_loaders import WebBaseLoader
 from langchain.prompts import ChatPromptTemplate
 from langchain.output_parsers import ResponseSchema
 from langchain.output_parsers import StructuredOutputParser
@@ -45,6 +44,7 @@ def get_original_url(url):
 def paraphrased_post(url,model):
 
     post=scrappost(url)
+    print(post)
 
     template="""You are a helpful paraphraser tool. You are provided with a content and your task is to paraphrase it.
     {data}"""
scrap_post.py CHANGED
@@ -1,33 +1,13 @@
-
-from unstructured.partition.html import partition_html
-#source = 'https://www.linkedin.com/posts/jobanpreet-singh-392581207_asr-whisper-speechrecognition-activity-7172803455718158336-MC-j?utm_source=share&utm_medium=member_desktop'
+import requests
+import json
+from bs4 import BeautifulSoup
 
 
 def scrappost(url):
-    all_groups = []
-    group = {'page_content': ''}
-
-    # ingest and preprocess webpage into Unstructured elements object
-    glossary_page = partition_html(url=url)
-
-    # iterate the document elements and group texts by title
-    for element in glossary_page:
-        if 'unstructured.documents.html.HTMLTitle' in str(type(element)):
-            # If there's already content in the group, add it to all_groups
-            if group['page_content']:
-                all_groups.append(group)
-                group = {'page_content': ''}
-            group['page_content'] += element.text
-        if 'unstructured.documents.html.HTMLNarrativeText' in str(type(element)):
-            group['page_content'] += element.text
-
-        if "unstructured.documents.html.HTMLListItem" in str(type(element)):
-            group['page_content']+=element.text
-
-    # # Add the last group if it exists
-    if group['page_content']:
-        all_groups.append(group)
-
-    # Print the groups
-    for group in all_groups[:1]:
-        return group["page_content"]
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    span_tags = soup.find_all('script',type="application/ld+json")
+    content_list = [tag.get_text() for tag in span_tags]
+    for content in content_list:
+        data=json.loads(content)['articleBody']
+        return data
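
The rewrite swaps unstructured's HTML partitioning for a much smaller dependency surface: the post page embeds a JSON-LD block (<script type="application/ld+json">) whose articleBody field carries the post text, so requests + BeautifulSoup suffice (the span_tags name is a leftover from an earlier selector; the tags are <script> elements). As written, the new code returns on the first block it sees and raises KeyError if that block lacks articleBody. A defensive variant, sketched under the same JSON-LD assumption; the status check, User-Agent header, and None fallback are additions, not part of the commit:

    import json

    import requests
    from bs4 import BeautifulSoup


    def scrappost(url):
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        response.raise_for_status()  # surface HTTP errors instead of parsing an error page

        soup = BeautifulSoup(response.text, "html.parser")
        for tag in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(tag.get_text())
            except json.JSONDecodeError:
                continue  # skip malformed JSON-LD blocks
            if isinstance(data, dict) and "articleBody" in data:
                return data["articleBody"]  # first block with a post body wins
        return None  # no JSON-LD block carried an articleBody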