# Each `paper` is a dictionary containing:
#       (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal (8) embeddings
#
# Generate references:
#   `References` class:
#       1. Read a given .bib file to collect papers; use the `search_paper_abstract` method to fill in missing abstracts.
#       2. Given some keywords, use the Semantic Scholar API to find papers.
#       3. Generate bibtex from the selected papers. --> to_bibtex()
#       4. Generate prompts from the selected papers. --> to_prompts()
#          A sample prompt: {"paper_id": "paper summary"}

# todo: (1) citations & citedby of provided papers:
#           load the pre-defined papers; use S2 to find all related works
#           add all citations to `bib_papers`
#           add all citedby to `bib_papers`
#           use Semantic Scholar to find their embeddings
#       (2) separate references:
#           divide references into different groups to reduce the token count
#           for generating different paragraphs of related works, use different sets of references

import requests
import re
import bibtexparser
from scholarly import scholarly
from scholarly import ProxyGenerator


######################################################################################################################
# Some basic tools
######################################################################################################################
def remove_newlines(serie):
    # This function is applied to the abstract of each paper to reduce the length of prompts.
    serie = serie.replace('\n', ' ')
    serie = serie.replace('\\n', ' ')
    serie = serie.replace('  ', ' ')
    serie = serie.replace('  ', ' ')
    return serie


def search_paper_abstract(title):
    pg = ProxyGenerator()
    success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155")  # todo: change this to env. var. for protection.
    if success:
        scholarly.use_proxy(pg)
        # input the title of a paper, return its abstract
        search_query = scholarly.search_pubs(title)
        found_paper = next(search_query)
    else:
        raise RuntimeError("ScraperAPI fails.")
    return remove_newlines(found_paper['bib']['abstract'])


def load_papers_from_bibtex(bib_file_path):
    with open(bib_file_path) as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    if len(bib_database.entries) == 0:
        return []
    else:
        bib_papers = []
        for bibitem in bib_database.entries:
            # Add each paper to `bib_papers`
            paper_id = bibitem.get("ID")
            title = bibitem.get("title")
            if title is None:
                continue
            journal = bibitem.get("journal")
            year = bibitem.get("year")
            author = bibitem.get("author")
            abstract = bibitem.get("abstract")
            if abstract is None:
                abstract = search_paper_abstract(title)
            result = {
                "paper_id": paper_id,
                "title": title,
                "link": "",
                "abstract": abstract,
                "authors": author,
                "year": year,
                "journal": journal
            }
            bib_papers.append(result)
        return bib_papers


######################################################################################################################
# Semantic Scholar (SS) API
######################################################################################################################
def ss_search(keywords, limit=20, fields=None):
    # spaces in the query are replaced with '+'
    if fields is None:
        fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"]
    keywords = keywords.lower()
    keywords = keywords.replace(" ", "+")
    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}'
    # headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY}
    headers = {"Accept": "*/*"}

    response = requests.get(url, headers=headers, timeout=30)
    return response.json()
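

# A hypothetical usage sketch (not called anywhere in this module): `ss_search` returns the raw
# JSON response, whose "data" list is what `_collect_papers_ss` below parses into paper
# dictionaries. The keyword and printed fields here are only examples; which fields are present
# depends on the API response.
def _demo_ss_search(keyword="deep reinforcement learning"):
    response_json = ss_search(keyword, limit=2)
    for item in response_json.get("data", []):
        print(item.get("title"), item.get("year"), item.get("venue"))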
def _collect_papers_ss(keyword, counts=3, tldr=False):
    def externalIds2link(externalIds):
        # Sample externalIds:
        #   "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}"
        if externalIds:
            # Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI
            # priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Medline > PubMedCentral)
            # DBLP
            dblp_id = externalIds.get('DBLP')
            if dblp_id is not None:
                dblp_link = f"dblp.org/rec/{dblp_id}"
                return dblp_link
            # arXiv
            arxiv_id = externalIds.get('ArXiv')
            if arxiv_id is not None:
                arxiv_link = f"arxiv.org/abs/{arxiv_id}"
                return arxiv_link
            return ""
        else:
            # if this is an empty dictionary, return an empty string
            return ""

    def extract_paper_id(last_name, year_str, title):
        pattern = r'^\w+'
        words = re.findall(pattern, title)
        # return last_name + year_str + title.split(' ', 1)[0]
        try:
            output = last_name + year_str + words[0]
        except IndexError:
            output = last_name + year_str + title[:4]
        return output

    def extract_author_info(raw_authors):
        authors = [author['name'] for author in raw_authors]
        authors_str = " and ".join(authors)
        try:
            last_name = authors[0].split()[-1]
        except IndexError:
            last_name = "ma"
        # pattern = r'^\w+'
        # last_name = re.findall(pattern, authors[0])
        return authors_str, last_name

    def parse_search_results(search_results_ss):
        if len(search_results_ss) == 0:
            return []

        # turn the search results into a list of paper dictionaries.
        papers_ss = []
        for raw_paper in search_results_ss:
            if raw_paper["abstract"] is None:
                continue

            authors_str, last_name = extract_author_info(raw_paper['authors'])
            year_str = str(raw_paper['year'])
            title = raw_paper['title']
            # some journals may contain &; escape it. e.g. journal={IEEE Power & Energy Society General Meeting}
            journal = raw_paper['venue'].replace("&", "\\&")
            if not journal:
                journal = "arXiv preprint"
            paper_id = extract_paper_id(last_name, year_str, title).lower()
            link = externalIds2link(raw_paper['externalIds'])
            if tldr and raw_paper['tldr'] is not None:
                abstract = raw_paper['tldr']['text']
            else:
                abstract = remove_newlines(raw_paper['abstract'])
            embeddings = raw_paper['embedding']['vector']
            result = {
                "paper_id": paper_id,
                "title": title,
                "abstract": abstract,
                "link": link,
                "authors": authors_str,
                "year": year_str,
                "journal": journal,
                "embeddings": embeddings
            }
            papers_ss.append(result)
        return papers_ss

    raw_results = ss_search(keyword, limit=counts)
    if raw_results is not None:
        search_results = raw_results.get("data")
        if search_results is None:
            search_results = []
    else:
        search_results = []
    results = parse_search_results(search_results)
    return results
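

# A minimal sketch (not wired into anything) of ranking collected papers by embedding similarity,
# assuming each paper dict carries the SPECTER-style "embeddings" vector filled in by
# `_collect_papers_ss` above. `_rank_papers_by_similarity` and `query_vector` are hypothetical
# names; `References.find_relevant` below could be built on something like this.
def _rank_papers_by_similarity(query_vector, papers):
    import math

    def cosine(u, v):
        dot = sum(a * b for a, b in zip(u, v))
        norm_u = math.sqrt(sum(a * a for a in u))
        norm_v = math.sqrt(sum(b * b for b in v))
        if norm_u == 0.0 or norm_v == 0.0:
            return 0.0
        return dot / (norm_u * norm_v)

    # score every paper that actually has an embedding, then sort by score (highest first)
    scored = [(cosine(query_vector, paper["embeddings"]), paper)
              for paper in papers if paper.get("embeddings")]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [paper for _, paper in scored]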
######################################################################################################################
# References Class
######################################################################################################################
class References:
    def __init__(self):
        # if load_papers:
        #     # todo: (1) a too large bibtex may have issues with token limitations; may truncate to 5 or 10
        #     #       (2) google scholar didn't give a full abstract for some papers ...
        #     #       (3) may use langchain to support long input
        #     self.papers = load_papers_from_bibtex(load_papers)
        # else:
        self.papers = {}

    def load_papers(self, bibtex, keyword):
        self.papers[keyword] = load_papers_from_bibtex(bibtex)

    def generate_keywords_dict(self):
        keywords_dict = {}
        for k in self.papers:
            keywords_dict[k] = len(self.papers[k])
        return keywords_dict

    def collect_papers(self, keywords_dict, tldr=False):
        """
        keywords_dict:
            {"machine learning": 5, "language model": 2};
            the key is the keyword, the value is how many references are needed.
        """
        for key, counts in keywords_dict.items():
            self.papers[key] = _collect_papers_ss(key, counts, tldr)
        # Remove duplicated references
        # todo: remove duplicated references in the tex_processing procedure.

    def find_relevant(self, max_refs=30):
        # todo: use embeddings to evaluate
        pass

    def to_bibtex(self, path_to_bibtex="ref.bib"):
        """
        Turn the saved paper list into a bibtex file "ref.bib". Return a list of all `paper_id`.
        """
        papers = self._get_papers(keyword="_all")

        # clear the bibtex file
        with open(path_to_bibtex, "w", encoding="utf-8") as file:
            file.write("")

        bibtex_entries = []
        paper_ids = []
        for paper in papers:
            bibtex_entry = f"""@article{{{paper["paper_id"]},
  title = {{{paper["title"]}}},
  author = {{{paper["authors"]}}},
  journal = {{{paper["journal"]}}},
  year = {{{paper["year"]}}},
  url = {{{paper["link"]}}}
}}"""
            bibtex_entries.append(bibtex_entry)
            paper_ids.append(paper["paper_id"])
            # Save the generated BibTeX entries to the file
            with open(path_to_bibtex, "a", encoding="utf-8") as file:
                file.write(bibtex_entry)
                file.write("\n\n")
        return paper_ids

    def _get_papers(self, keyword="_all"):
        if keyword == "_all":
            papers = []
            for k, v in self.papers.items():
                papers = papers + v
        else:
            papers = self.papers[keyword]
        return papers

    def to_prompts(self, keyword="_all"):
        # `prompts`:
        #   {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"}
        # this will be used to instruct the GPT model to cite the correct bibtex entry.
        papers = self._get_papers(keyword)
        prompts = {}
        for paper in papers:
            prompts[paper["paper_id"]] = paper["abstract"]
        return prompts

    def to_json(self, keyword="_all"):
        papers = self._get_papers(keyword)
        papers_json = {}
        for paper in papers:
            papers_json[paper["paper_id"]] = paper
        return papers_json


if __name__ == "__main__":
    # r = ss_search("Deep Q-Networks")['data']
    # print(r)
    # papers_json = {}
    # # for i in range(len(r)):
    # #     r[i]
    # #
    # # with open("Output.txt", "w") as text_file:
    # #     text_file.write("Purchase Amount: %s" % TotalAmount)
    # embeddings = r[0]['embedding']['vector']
    # print(embeddings)

    refs = References()
    keywords_dict = {
        "Deep Q-Networks": 5,
        "Actor-Critic Algorithms": 4,
        "Exploration-Exploitation Trade-off": 3
    }
    refs.collect_papers(keywords_dict, tldr=True)
    for k in refs.papers:
        papers = refs.papers[k]
        print("keyword: ", k)
        for paper in papers:
            print(paper["paper_id"])
    refs.to_json()
    refs.to_bibtex()
    refs.to_prompts()
    # print(refs.papers)

    # todo: test load_papers
    #       write tests covering `references.py`. / fix this as a stable version
    # for p in refs.papers:
    #     print(p["paper_id"])
    # print(len(refs.papers))
    #
    # papers_json = refs.to_json()
    # print(papers_json)
    # with open("papers.json", "w", encoding='utf-8') as text_file:
    #     text_file.write(f"{papers_json}")

    # bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib"
    # papers = load_papers_from_bibtex(bib)
    # for paper in papers:
    #     print(paper)
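
    # A minimal sketch (same hypothetical "papers.json" file name as above) of persisting
    # `refs.to_json()` as real JSON, rather than writing the dict's repr as the commented-out
    # code above does:
    # import json
    # with open("papers.json", "w", encoding="utf-8") as text_file:
    #     json.dump(refs.to_json(), text_file, ensure_ascii=False, indent=2)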