Jueri committed on
Commit
1310c1d
1 Parent(s): 99e455f

initial commit

Browse files
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from clean_bibtex.clean_bibtex import get_url, get_dblp_bibtext, parse_bibtext_file_titles
3
+
4
+ DEFAULT_TEXT = """@inproceedings{DBLP:conf/naacl/DevlinCLT19,
5
+ author = {Jacob Devlin and
6
+ Ming{-}Wei Chang and
7
+ Kenton Lee and
8
+ Kristina Toutanova},
9
+ editor = {Jill Burstein and
10
+ Christy Doran and
11
+ Thamar Solorio},
12
+ title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language
13
+ Understanding},
14
+ booktitle = {Proceedings of the 2019 Conference of the North American Chapter of
15
+ the Association for Computational Linguistics: Human Language Technologies,
16
+ {NAACL-HLT} 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long
17
+ and Short Papers)},
18
+ pages = {4171--4186},
19
+ publisher = {Association for Computational Linguistics},
20
+ year = {2019},
21
+ url = {https://doi.org/10.18653/v1/n19-1423},
22
+ doi = {10.18653/v1/n19-1423},
23
+ timestamp = {Fri, 06 Aug 2021 00:41:31 +0200},
24
+ biburl = {https://dblp.org/rec/conf/naacl/DevlinCLT19.bib},
25
+ bibsource = {dblp computer science bibliography, https://dblp.org}
26
+ }
27
+ """
28
+
29
+
30
def _strip_bibtex_markup(value):
    """Turn a raw BibTeX field value into plain text."""
    text = value.strip().rstrip(",").strip()
    # Values may be delimited by double quotes instead of braces.
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        text = text[1:-1]
    text = text.replace("{", "").replace("}", "")
    # Collapse the indentation that continuation lines leave behind.
    return " ".join(text.split())


def parse_titles(bibtex):
    """Extract the publication titles from a BibTeX string.

    Only ``title`` fields are considered; fields that merely start with or
    contain the word (``titleaddon``, ``booktitle``) are skipped. A
    brace-delimited value that spans several lines is joined into a single
    title: lines are collected until the braces of the value are balanced.

    Args:
        bibtex (str): BibTeX source, possibly containing several entries.

    Returns:
        list[str]: The titles in order of appearance, without braces,
        surrounding quotes or the trailing field separator.
    """
    titles = []
    pending = None  # fragments of a title whose braces are still open

    for raw_line in bibtex.splitlines():
        line = raw_line.strip()
        if pending is None:
            # Split on the first "=" only, so "=" inside a title survives.
            key, separator, value = line.partition("=")
            if not separator or key.strip().lower() != "title":
                continue
            pending = [value.strip()]
        elif line:
            pending.append(line)

        joined = " ".join(pending)
        if joined.count("{") > joined.count("}"):
            continue  # the value continues on the next line
        titles.append(_strip_bibtex_markup(joined))
        pending = None

    if pending is not None:
        # Unterminated value at end of input: keep what was collected.
        titles.append(_strip_bibtex_markup(" ".join(pending)))
    return titles
40
+
41
+
42
def cleaner(bibtex, keep_keys, file_obj):
    """Look up every title of the given BibTeX input on DBLP.

    An uploaded file takes precedence over the pasted string. Each title is
    searched on DBLP; the citations that are found are written to
    ``cleaned.bib`` and the titles that could not be resolved are reported.

    Args:
        bibtex (str): BibTeX source pasted into the text box (may be empty).
        keep_keys (bool): Value of the "keep original keys" checkbox. It is
            accepted for interface compatibility but not used yet.
        file_obj: Uploaded file object exposing its path as ``.name``, or a
            falsy value when no file was uploaded.

    Returns:
        tuple[str, str]: Path of the written BibTeX file and a
        newline-separated list of the titles that could not be resolved.
    """
    if file_obj:
        titles = parse_bibtext_file_titles(file_obj.name)
    elif bibtex:
        titles = parse_titles(bibtex)
    else:
        # No input at all: behave like an input without any title.
        titles = []

    dblp_citations = []
    errors = []
    for publication in titles:
        site_url = get_url(publication)
        dblp_citation = get_dblp_bibtext(site_url) if site_url else None
        if dblp_citation:
            dblp_citations.append(dblp_citation)
        else:
            errors.append(" - " + publication)

    # Always (re)write the output file: the returned path is then valid even
    # when nothing was found, and a stale result of an earlier request is
    # never handed out.
    filename = "cleaned.bib"
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(dblp_citations))

    return filename, "\n".join(errors)
67
+
68
+
69
# Gradio front end for `cleaner`: the three inputs map, in order, onto its
# `bibtex`, `keep_keys` and `file_obj` parameters; the two outputs receive the
# returned file path and the error report.
iface = gr.Interface(
    fn=cleaner,
    title="Bibtex cleaner",
    description="Clean a bibtex file or string simply by dragging the incomplete or broken bibtex file into the file box or pasting a bibtex string into the string field. The titles are extracted, searched at the DBLP and a clean bibtexfile is created.",
    inputs=[
        gr.inputs.Textbox(label="Paste a string here:", lines=1),
        gr.inputs.Checkbox(label="Keep original keys:"),
        gr.inputs.File(label="Drag a Bibtex file here:", file_count="single", type="file", optional=True),
    ],
    outputs=[
        gr.outputs.File(label="Cleaned bibtext file:"),
        gr.outputs.Textbox(type="auto", label="Errors:"),
    ],
)
iface.launch()
clean_bibtex/__init__.py ADDED
File without changes
clean_bibtex/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (140 Bytes). View file
 
clean_bibtex/__pycache__/clean_bibtex.cpython-39.pyc ADDED
Binary file (3.63 kB). View file
 
clean_bibtex/clean_bibtex.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """This python script parses an incomplete BibTeX file to a BibTeX file with dblp references and styling.
3
+
4
+ Example:
5
+ python bibtext_to_dblp <bibtext input file> <output file>
6
+ """
7
+
8
+ import requests
9
+ import click
10
+ from typing import Optional
11
+ import time
12
+
13
+
14
def _clean_title(value: str) -> str:
    """Turn a raw BibTeX field value into plain text."""
    text = value.strip().rstrip(",").strip()
    # Values may be delimited by double quotes instead of braces.
    if len(text) >= 2 and text[0] == '"' and text[-1] == '"':
        text = text[1:-1]
    text = text.replace("{", "").replace("}", "")
    # Collapse the indentation that continuation lines leave behind.
    return " ".join(text.split())


def _extract_titles(lines: list[str]) -> list[str]:
    """Collect the values of all ``title`` fields from BibTeX lines."""
    titles = []
    pending = None  # fragments of a title whose braces are still open

    for raw_line in lines:
        line = raw_line.strip()
        if pending is None:
            # Split on the first "=" only, so "=" inside a title survives.
            key, separator, value = line.partition("=")
            if not separator or key.strip().lower() != "title":
                continue
            pending = [value.strip()]
        elif line:
            pending.append(line)

        joined = " ".join(pending)
        if joined.count("{") > joined.count("}"):
            continue  # the value continues on the next line
        titles.append(_clean_title(joined))
        pending = None

    if pending is not None:
        # Unterminated value at end of file: keep what was collected.
        titles.append(_clean_title(" ".join(pending)))
    return titles


def parse_bibtext_file_titles(file_path: str) -> list[str]:
    """Function to parse the titles of the publications from a BibTeX file.

    Only ``title`` fields are read (``booktitle`` and similar fields are
    skipped). Brace-delimited titles that span several lines are joined into
    one title.

    Args:
        file_path (str): File path of the BibTeX file to parse.

    Returns:
        list[str]: List with the parsed titles.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the file content cannot be decoded as UTF-8.
    """
    try:
        # Read with a fixed encoding so the result does not depend on the
        # locale of the machine running the script.
        with open(file_path, "r", encoding="utf-8") as in_file:
            lines = in_file.read().splitlines()
    except OSError as err:
        print("OS error: {0}".format(err))
        raise
    except ValueError:
        print("Could not parse, bibtext file is malformed.")
        raise
    return _extract_titles(lines)
41
+
42
+
43
def get_url(title: str) -> Optional[str]:
    """Search DBLP with a publication title and return the URL of the best hit.

    Args:
        title (str): Title of the publication to search for.

    Returns:
        Optional[str]: URL of the DBLP page of the first search hit, or None
        if the request failed or the response contained no usable hit.
    """
    try:
        # Pass the query through `params` so that characters such as "&",
        # "#" or "+" in a title are URL-encoded instead of corrupting the
        # query string.
        response = requests.get(
            "https://dblp.org/search/publ/api",
            params={"q": title, "format": "json"},
            timeout=30,
        )
        return response.json()["result"]["hits"]["hit"][0]["info"]["url"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Network failure, non-JSON body, or a result without hits.
        return None
60
+
61
+
62
def get_dblp_bibtext(url: str) -> Optional[str]:
    """Get the BibTeX reference from a DBLP publication site URL.

    The reference is fetched from the given URL with ``.bib`` appended.

    Args:
        url (str): URL of the publication site.

    Returns:
        Optional[str]: BibTeX reference for the publication, or None if the
        request failed or did not answer with status 200.
    """
    try:
        response = requests.get(url + ".bib", timeout=30)
    except requests.RequestException:
        # A connection problem for one publication should be reported as a
        # missing citation rather than abort the whole run.
        return None
    if response.status_code == 200:
        return response.text
    return None
76
+
77
+
78
@click.command()
@click.argument("input_file")
@click.argument("outpu_file")
def clean_bibtex(outpu_file: str, input_file: str):
    """Convert an incomplete BibTeX file into a complete BibTeX file with dblp styling.

    Args:
        outpu_file (str): Destination for the new file.
        input_file (str): Input file to parse bibtext citations from.
    """
    titles = parse_bibtext_file_titles(input_file)
    errors = []
    dblp_citations = []

    # f-string: the count was previously printed as a literal placeholder.
    click.echo(f"Requesting citation metadata for {len(titles)} publications, this may take a while...")
    with click.progressbar(length=len(titles)) as bar:
        for publication in titles:
            site_url = get_url(publication)
            dblp_citation = get_dblp_bibtext(site_url) if site_url else None
            if dblp_citation:
                dblp_citations.append(dblp_citation)
            else:
                # Either the search had no hit or the .bib download failed.
                errors.append(" - " + publication)
            time.sleep(1)  # abide dblp crawl-delay
            bar.update(1)

    if dblp_citations:
        with open(outpu_file, "w", encoding="utf-8") as out_file:
            out_file.write("\n".join(dblp_citations))
        click.echo(f"\nNew BibTeX file written to: {outpu_file}")
    else:
        click.echo("No citations to write.")
    if errors:
        click.echo("\nCould not create citations for:")
        click.echo("\n".join(errors))


if __name__ == "__main__":
    clean_bibtex()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ requests
2
+ gradio