import json
import uuid

from langchain.text_splitter import CharacterTextSplitter

# Module-level splitter shared by importers of this file: configured with a
# newline separator, chunk_size=3000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=3000,
    chunk_overlap=0
)


def generate_uuid() -> str:
    """Return a freshly generated random (version 4) UUID as a string."""
    return str(uuid.uuid4())


def check_id_extis_in_json(file_id) -> bool:
    """Return True if *file_id* is contained in ``file_ids.json``.

    The file is read from the current working directory on every call and
    its parsed content is tested with ``in`` (so a JSON array is checked by
    element, a JSON object by key).

    Raises:
        OSError: if ``file_ids.json`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Explicit encoding so decoding does not depend on the platform locale.
    with open('file_ids.json', 'r', encoding='utf-8') as f:
        file_ids = json.load(f)
    return file_id in file_ids


def compare_paper_ids(data, paper_ids) -> list:
    """Return the entries of *paper_ids* that have no match in *data*.

    Args:
        data: iterable of mappings, each providing a ``'doi_no'`` key.
        paper_ids: iterable of ids to look up among those ``'doi_no'`` values.

    Returns:
        The ids from *paper_ids* (original order, duplicates kept) that are
        not equal to any ``item['doi_no']``.

    Raises:
        KeyError: if an item of *data* has no ``'doi_no'`` key.
    """
    # Build the set once so each membership test is O(1).
    existing_dois = {item['doi_no'] for item in data}
    return [paper_id for paper_id in paper_ids if paper_id not in existing_dois]


def extract_json_from_text(text):
    """Extract the ``'data'`` entry of a JSON object embedded in *text*.

    *text* is converted with ``str()``; the span from the first ``'{'`` to
    the last ``'}'`` is lowercased and parsed as JSON.

    Returns:
        The value stored under ``'data'`` in the parsed object, or ``[]`` if
        that key is absent. If anything goes wrong (no braces, invalid JSON,
        ...), the error is printed in red and the stringified input is
        returned unchanged.
    """
    text = str(text)
    try:
        # Find the JSON part within the text.
        start_index = text.find('{')
        end_index = text.rfind('}') + 1
        json_part = text[start_index:end_index]
        # NOTE(review): lower() affects keys AND string values (e.g. mixed-case
        # identifiers are folded). Kept as-is because callers may rely on the
        # lowercased output - confirm whether only key normalisation is wanted.
        json_part = json.loads(json_part.lower())
        print("json",type(json_part))
        print(json_part)
        return json_part.get('data', [])
    except Exception as e:
        # Deliberate best-effort: never raise, hand the raw text back instead.
        # The trailing reset sequence needs the ESC prefix, otherwise the
        # terminal stays red after the message.
        print(f"\033[31m Exception occurred while loading JSON: {e} \033[0m")
        return text