Spaces:

DrishtiSharma
/

linkedin-post-generator

Running

App Files Files Community

linkedin-post-generator / preprocess.py

DrishtiSharma

Upload 6 files

765a4ee verified 12 days ago

raw

history blame

3.43 kB

	import json
	from llm_helper import llm
	from langchain_core.prompts import PromptTemplate
	from langchain_core.output_parsers import JsonOutputParser
	from langchain_core.exceptions import OutputParserException


	def process_posts(raw_file_path, processed_file_path=None):
	    """Enrich raw posts with LLM-extracted metadata and unified tags.

	    Loads a JSON list of post objects from ``raw_file_path``, merges the
	    result of ``extract_metadata(post['text'])`` into each post, rewrites
	    every post's ``tags`` through the mapping returned by
	    ``get_unified_tags`` and, when an output path is given, writes the
	    result as indented JSON.

	    Args:
	        raw_file_path: Path of the UTF-8 JSON file holding the raw posts.
	            Every post must have a ``text`` key.
	        processed_file_path: Destination file for the enriched posts.
	            When ``None`` (the default) nothing is written to disk.

	    Returns:
	        The list of enriched post dicts.
	    """
	    with open(raw_file_path, encoding='utf-8') as file:
	        posts = json.load(file)

	    # Dict union: metadata keys win over same-named keys of the raw post.
	    enriched_posts = [post | extract_metadata(post['text']) for post in posts]

	    unified_tags = get_unified_tags(enriched_posts)
	    for post in enriched_posts:
	        current_tags = post.get('tags') or []
	        # Fall back to the original tag when the mapping omits it, rather
	        # than failing with KeyError. dict.fromkeys removes duplicates
	        # while keeping a stable, reproducible order (a set would not).
	        post['tags'] = list(
	            dict.fromkeys(unified_tags.get(tag, tag) for tag in current_tags)
	        )

	    # The output path is optional; open(None) would raise TypeError.
	    if processed_file_path is not None:
	        with open(processed_file_path, encoding='utf-8', mode="w") as outfile:
	            json.dump(enriched_posts, outfile, indent=4)

	    return enriched_posts


	def extract_metadata(post):
	    """Ask the LLM for the line count, language and tags of one post.

	    Args:
	        post: Text of the LinkedIn post; substituted into the prompt.

	    Returns:
	        The JSON object parsed from the LLM reply, as a dict. The prompt
	        requests the keys ``line_count``, ``language`` and ``tags``; their
	        presence is not verified here.

	    Raises:
	        OutputParserException: If the reply cannot be parsed as JSON or
	            is not a JSON object.
	    """
	    template = '''
	You are given a LinkedIn post. You need to extract number of lines, language of the post and tags.
	1. Return a valid JSON. No preamble.
	2. JSON object should have exactly three keys: line_count, language and tags.
	3. tags is an array of text tags. Extract maximum two tags.
	4. Language should be English or Hinglish (Hinglish means hindi + english)

	Here is the actual post on which you need to perform this task:
	{post}
	'''

	    pt = PromptTemplate.from_template(template)
	    chain = pt | llm
	    response = chain.invoke(input={"post": post})

	    json_parser = JsonOutputParser()
	    try:
	        res = json_parser.parse(response.content)
	    except OutputParserException as err:
	        # Chain the parser's error so the root cause stays in the traceback.
	        raise OutputParserException(
	            "Unable to parse post metadata: LLM response is not valid JSON."
	        ) from err

	    # Callers merge the result into the post dict, so it must be an object.
	    if not isinstance(res, dict):
	        raise OutputParserException(
	            "Unable to parse post metadata: LLM response is not a JSON object."
	        )
	    return res


	def get_unified_tags(posts_with_metadata):
	    """Ask the LLM to merge the tags of all posts into a shorter unified set.

	    Args:
	        posts_with_metadata: Iterable of post dicts. Each post's ``tags``
	            entry (an iterable of tags) is collected; posts without tags
	            contribute nothing.

	    Returns:
	        A dict mapping each original tag to its unified tag, as parsed
	        from the LLM reply. An empty dict when no post carries any tag.

	    Raises:
	        OutputParserException: If the reply cannot be parsed as JSON or
	            is not a JSON object.
	    """
	    unique_tags = set()
	    for post in posts_with_metadata:
	        unique_tags.update(post.get('tags') or [])

	    # Nothing to unify: skip the LLM round trip entirely.
	    if not unique_tags:
	        return {}

	    # Sort so the prompt is identical from run to run (set order is not);
	    # str() guards against non-string tags in the LLM-produced metadata.
	    unique_tags_list = ','.join(sorted(str(tag) for tag in unique_tags))

	    template = '''I will give you a list of tags. You need to unify tags with the following requirements,
	1. Tags are unified and merged to create a shorter list.
	Example 1: "Jobseekers", "Job Hunting" can be all merged into a single tag "Job Search".
	Example 2: "Motivation", "Inspiration", "Drive" can be mapped to "Motivation"
	Example 3: "Personal Growth", "Personal Development", "Self Improvement" can be mapped to "Self Improvement"
	Example 4: "Scam Alert", "Job Scam" etc. can be mapped to "Scams"
	2. Each tag should be follow title case convention. example: "Motivation", "Job Search"
	3. Output should be a JSON object, No preamble
	4. Output should have mapping of original tag and the unified tag.
	For example: {{"Jobseekers": "Job Search", "Job Hunting": "Job Search", "Motivation": "Motivation"}}

	Here is the list of tags:
	{tags}
	'''
	    pt = PromptTemplate.from_template(template)
	    chain = pt | llm
	    response = chain.invoke(input={"tags": unique_tags_list})

	    json_parser = JsonOutputParser()
	    try:
	        res = json_parser.parse(response.content)
	    except OutputParserException as err:
	        # Chain the parser's error so the root cause stays in the traceback.
	        raise OutputParserException(
	            "Unable to parse unified tags: LLM response is not valid JSON."
	        ) from err

	    # Callers look tags up by key, so the reply must be a JSON object.
	    if not isinstance(res, dict):
	        raise OutputParserException(
	            "Unable to parse unified tags: LLM response is not a JSON object."
	        )
	    return res


	if __name__ == "__main__":
	    # Script entry point: enrich the raw posts file and write the
	    # processed copy next to it in the data directory.
	    process_posts(
	        raw_file_path="data/raw_posts.json",
	        processed_file_path="data/processed_posts.json",
	    )