from fastapi import HTTPException, APIRouter, Request
from pydantic import BaseModel
import requests
from time import sleep
import spacy
from pymongo import MongoClient

# Load SciSpacy model
nlp = spacy.load("en_core_sci_sm")

router = APIRouter()

# Pydantic model for request body
class SearchRequest(BaseModel):
    topic: str
    year: int
    userId: str  # Identifies the user whose results are saved

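# Illustrative request body accepted by the endpoints below (the values are
# made up; only the field names come from SearchRequest):
# {
#     "topic": "machine learning in healthcare",
#     "year": 2021,
#     "userId": "user-123"
# }
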
# Function to extract keywords from user topic
def extract_keywords(text):
    doc = nlp(text.lower())
    noun_chunks = [chunk.text.strip() for chunk in doc.noun_chunks]
    individual_tokens = [
        token.text.strip() for token in doc
        if token.pos_ in ["NOUN", "VERB"] and not token.is_stop
    ]
    keywords = set(noun_chunks + individual_tokens)
    # Drop tokens that are strict substrings of a longer noun chunk
    cleaned_keywords = set()
    for keyword in keywords:
        if not any(keyword in chunk and keyword != chunk for chunk in noun_chunks):
            cleaned_keywords.add(keyword)
    return sorted(cleaned_keywords)

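# Illustrative call (the exact output depends on the en_core_sci_sm model):
# extract_keywords("machine learning in healthcare") might return something
# like ["healthcare", "machine learning"].
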
# Fetch works from OpenAlex based on refined topic
def fetch_works(refined_query, year, per_page=200):
    OPENALEX_API_URL = "https://api.openalex.org/works"
    params = {
        "filter": f"title_and_abstract.search:{refined_query},publication_year:{year}",
        "per-page": per_page,
        "cursor": "*"
    }
    all_results = []
    while True:
        response = requests.get(OPENALEX_API_URL, params=params)
        if response.status_code != 200:
            raise HTTPException(status_code=response.status_code, detail="Error fetching data from OpenAlex")
        data = response.json()
        all_results.extend(data.get("results", []))
        # Follow cursor pagination until OpenAlex stops returning a next_cursor
        next_cursor = data.get("meta", {}).get("next_cursor")
        if next_cursor:
            params["cursor"] = next_cursor
        else:
            break
        sleep(0.2)  # Small delay between pages to be polite to the API
    return all_results

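# Illustrative request built above (values are examples only): a refined query
# of "machine+learning" with year 2021 sends
#   filter=title_and_abstract.search:machine+learning,publication_year:2021
# and keeps following meta.next_cursor until no further cursor is returned.
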
# Batch fetch host organization details for sources
def batch_fetch_host_org_details(source_ids):
    host_org_map = {}
    batch_size = 100
    for i in range(0, len(source_ids), batch_size):
        batch = source_ids[i:i + batch_size]
        openalex_ids = [src_id.split("/")[-1] for src_id in batch]
        filter_query = "|".join(openalex_ids)
        url = f"https://api.openalex.org/sources?filter=openalex_id:{filter_query}"
        response = requests.get(url)
        if response.status_code == 200:
            sources = response.json().get("results", [])
            for source in sources:
                source_id = source.get("id")
                # host_organization is an ID/URL string; the display name comes from host_organization_name
                host_org_id = source.get("host_organization", "unknown")
                host_org_name = source.get("host_organization_name", "unknown")
                host_org_map[source_id] = {
                    "host_organization_name": host_org_name,
                    "host_organization_id": host_org_id
                }
            sleep(0.1)  # Small delay between batches
        else:
            raise HTTPException(status_code=response.status_code, detail="Error fetching host organization details")
    return host_org_map

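# Illustrative batch URL (IDs are made up): source IDs
# ["https://openalex.org/S111", "https://openalex.org/S222"] produce
#   https://api.openalex.org/sources?filter=openalex_id:S111|S222
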
# Extract metadata from works
def extract_metadata(works):
    # Collect the distinct source IDs so host organizations can be fetched in batches
    source_ids = list({
        work["primary_location"]["source"]["id"]
        for work in works
        if work.get("primary_location") and work["primary_location"].get("source")
    })
    # Fetch host organization details (name and ID)
    host_org_map = batch_fetch_host_org_details(source_ids)
    metadata = []
    for work in works:
        primary_location = work.get("primary_location") or {}
        source = primary_location.get("source") or {}
        source_id = source.get("id")
        host_org_details = host_org_map.get(
            source_id,
            {"host_organization_name": "Unknown", "host_organization_id": "Unknown"}
        )
        # Resolve the work type, falling back through the available fields
        work_type = (
            work.get("type")                  # top-level "type" field
            or primary_location.get("type")   # "type" in primary_location
            or source.get("type")             # "type" in source
            or "unknown"                      # fallback value
        )
        metadata.append({
            "id": work.get("id"),
            "is_oa": work.get("open_access", {}).get("is_oa", False),
            "oa_status": work.get("open_access", {}).get("oa_status", "unknown"),
            "venue": source.get("display_name", "unknown"),
            "venue_id": source_id or "unknown",
            "host_organization_name": host_org_details["host_organization_name"],
            "host_organization_id": host_org_details["host_organization_id"],
            "type": work_type,
            "publication_date": work.get("publication_date", "unknown"),
        })
    return metadata

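# Illustrative shape of one metadata entry (values are made up; keys match the
# dictionary built above):
# {
#     "id": "https://openalex.org/W...",
#     "is_oa": True,
#     "oa_status": "gold",
#     "venue": "Example Journal",
#     "venue_id": "https://openalex.org/S...",
#     "host_organization_name": "Example Publisher",
#     "host_organization_id": "https://openalex.org/P...",
#     "type": "article",
#     "publication_date": "2021-05-01"
# }
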
# Clean metadata by removing entries with unknown venue and host organization name
def clean_metadata(metadata):
    initial_count = len(metadata)
    # Drop entries where both the venue and the host organization are unknown
    cleaned_metadata = [
        entry for entry in metadata
        if not (entry["venue"] == "unknown" and entry["host_organization_name"] == "Unknown")
    ]
    final_count = len(cleaned_metadata)
    print(f"Total papers before cleaning: {initial_count}")
    print(f"Total papers after cleaning: {final_count}")
    return cleaned_metadata

# Save metadata to MongoDB
async def save_to_mongodb(userId, topic, year, metadata, request: Request):
    document = {
        "userId": userId,
        "topic": topic,
        "year": year,
        "metadata": metadata
    }
    collection = request.app.state.collection2
    # Upsert so repeated searches for the same (userId, topic, year) overwrite previous results
    await collection.update_one(
        {"userId": userId, "topic": topic, "year": year},
        {"$set": document},
        upsert=True
    )
    print(f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}")

# FastAPI endpoint
@router.post("/search/")
async def search(data: SearchRequest, request: Request):
    userId = data.userId
    topic = data.topic
    year = data.year
    # Extract keywords and refine the query
    keywords = extract_keywords(topic)
    refined_query = "+".join(keywords)
    print(f"Using refined search query: {refined_query}")
    # Fetch works from OpenAlex
    works = fetch_works(refined_query, year)
    if not works:
        raise HTTPException(status_code=404, detail="No works found for the given query")
    # Extract and clean the metadata
    metadata = extract_metadata(works)
    cleaned_metadata = clean_metadata(metadata)
    # Save metadata to MongoDB
    await save_to_mongodb(userId, topic, year, cleaned_metadata, request)
    # Return metadata as JSON response
    return {
        "message": f"Data saved to MongoDB for userId: {userId}, topic: {topic}, year: {year}",
        "metadata": cleaned_metadata
    }

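# Illustrative client call (assumes this router is mounted on an app served at
# http://localhost:8000; adjust the URL and prefix to your deployment):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/search/",
#       json={"topic": "machine learning in healthcare", "year": 2021, "userId": "user-123"},
#   )
#   print(resp.json()["message"])
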
@router.post("/check-data-exists-venue/")
async def check_data_exists(request_data: SearchRequest, request: Request):
    # Build a query to check whether results already exist for this user and topic
    query = {
        "userId": request_data.userId,
        "topic": request_data.topic
    }
    # Add year to the query if it is provided
    if request_data.year:
        query["year"] = request_data.year
    collection = request.app.state.collection2
    # Check if a document matching the query exists
    document = await collection.find_one(query)
    # Return result
    return {
        "exists": document is not None,
        "message": "Data found" if document else "Data not found"
    }
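# Illustrative response (depends on whether matching data was stored):
# {"exists": true, "message": "Data found"}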