Spaces:
Runtime error
Runtime error
Jueri
committed on
Commit
•
1310c1d
1
Parent(s):
99e455f
initial commit
Browse files- app.py +83 -0
- clean_bibtex/__init__.py +0 -0
- clean_bibtex/__pycache__/__init__.cpython-39.pyc +0 -0
- clean_bibtex/__pycache__/clean_bibtex.cpython-39.pyc +0 -0
- clean_bibtex/clean_bibtex.py +118 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from clean_bibtex.clean_bibtex import get_url, get_dblp_bibtext, parse_bibtext_file_titles
|
3 |
+
|
4 |
+
# Sample BibTeX entry kept as a module-level constant; nothing in the visible
# part of app.py references it.
DEFAULT_TEXT = """@inproceedings{DBLP:conf/naacl/DevlinCLT19,
|
5 |
+
author = {Jacob Devlin and
|
6 |
+
Ming{-}Wei Chang and
|
7 |
+
Kenton Lee and
|
8 |
+
Kristina Toutanova},
|
9 |
+
editor = {Jill Burstein and
|
10 |
+
Christy Doran and
|
11 |
+
Thamar Solorio},
|
12 |
+
title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language
|
13 |
+
Understanding},
|
14 |
+
booktitle = {Proceedings of the 2019 Conference of the North American Chapter of
|
15 |
+
the Association for Computational Linguistics: Human Language Technologies,
|
16 |
+
{NAACL-HLT} 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long
|
17 |
+
and Short Papers)},
|
18 |
+
pages = {4171--4186},
|
19 |
+
publisher = {Association for Computational Linguistics},
|
20 |
+
year = {2019},
|
21 |
+
url = {https://doi.org/10.18653/v1/n19-1423},
|
22 |
+
doi = {10.18653/v1/n19-1423},
|
23 |
+
timestamp = {Fri, 06 Aug 2021 00:41:31 +0200},
|
24 |
+
biburl = {https://dblp.org/rec/conf/naacl/DevlinCLT19.bib},
|
25 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
26 |
+
}
|
27 |
+
"""
|
28 |
+
|
29 |
+
|
30 |
+
def parse_titles(bibtex):
    """Extract publication titles from a pasted BibTeX string.

    Every line whose first non-blank text is ``title`` is treated as a title
    field. The text after the first ``=`` is the value; braces and the
    trailing field separator are removed. Only the line that carries the
    field name is read, so a title wrapped over several lines is captured up
    to its first line break.

    Args:
        bibtex (str): BibTeX source text.

    Returns:
        list[str]: Non-empty titles in order of appearance.
    """
    titles = []
    for line in bibtex.splitlines():
        if not line.strip().startswith("title"):
            continue
        # Split on the first "=" only, so an "=" inside the title survives.
        _, _, value = line.partition("=")
        title_clean = value.replace("{", "").replace("}", "").strip()
        # The split lines no longer end in a newline, so the field separator
        # has to be removed as a plain trailing comma.
        title_clean = title_clean.rstrip(",").strip()
        if title_clean:
            titles.append(title_clean)
    return titles
|
40 |
+
|
41 |
+
|
42 |
+
def cleaner(bibtex, keep_keys, file_obj):
    """Gradio callback: look up every title on DBLP and write a cleaned file.

    Titles are taken from the uploaded file when one is given, otherwise from
    the pasted text. Each title is resolved with ``get_url`` and
    ``get_dblp_bibtext``; titles for which either step yields nothing are
    reported back as errors.

    Args:
        bibtex (str): Pasted BibTeX text; used only when no file is uploaded.
        keep_keys (bool): Value of the "Keep original keys" checkbox. It is
            accepted for the interface but not used by this function.
        file_obj: Uploaded file object whose path is read from ``.name``, or
            a falsy value when nothing was uploaded.

    Returns:
        tuple[str, str]: Path of the written ``cleaned.bib`` file and the
        newline-joined list of titles that could not be resolved.
    """
    if file_obj:
        titles = parse_bibtext_file_titles(file_obj.name)
    elif bibtex:
        titles = parse_titles(bibtex)
    else:
        # Neither input was supplied: nothing to look up.
        titles = []

    dblp_citations = []
    errors = []
    for publication in titles:
        site_url = get_url(publication)
        dblp_citation = get_dblp_bibtext(site_url) if site_url else None
        if dblp_citation:
            dblp_citations.append(dblp_citation)
        else:
            errors.append(" - " + publication)

    # Always write the file (possibly empty) so the returned path exists even
    # when no citation could be resolved.
    filename = "cleaned.bib"
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write("\n".join(dblp_citations))

    return filename, "\n".join(errors)
|
67 |
+
|
68 |
+
|
69 |
+
# Input widgets, listed in the positional order of cleaner's parameters
# (bibtex, keep_keys, file_obj).
_input_components = [
    gr.inputs.Textbox(label="Paste a string here:", lines=1),
    gr.inputs.Checkbox(label="Keep original keys:"),
    gr.inputs.File(label="Drag a Bibtex file here:", file_count="single", type="file", optional=True),
]

# Output widgets, matching the (file path, error text) pair returned by cleaner.
_output_components = [
    gr.outputs.File(label="Cleaned bibtext file:"),
    gr.outputs.Textbox(type="auto", label="Errors:"),
]

iface = gr.Interface(
    fn=cleaner,
    title="Bibtex cleaner",
    description="Clean a bibtex file or string simply by dragging the incomplete or broken bibtex file into the file box or pasting a bibtex string into the string field. The titles are extrected, searched at the DBLP and a clean bibtexfile is created.",
    inputs=_input_components,
    outputs=_output_components,
)
iface.launch()
|
clean_bibtex/__init__.py
ADDED
File without changes
|
clean_bibtex/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (140 Bytes). View file
|
|
clean_bibtex/__pycache__/clean_bibtex.cpython-39.pyc
ADDED
Binary file (3.63 kB). View file
|
|
clean_bibtex/clean_bibtex.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""This python script parses an incomplete BibTeX file to a BibTeX file with dblp references and styling.
|
3 |
+
|
4 |
+
Example:
|
5 |
+
python bibtext_to_dblp <bibtext input file> <output file>
|
6 |
+
"""
|
7 |
+
|
8 |
+
import requests
|
9 |
+
import click
|
10 |
+
from typing import Optional
|
11 |
+
import time
|
12 |
+
|
13 |
+
|
14 |
+
def parse_bibtext_file_titles(file_path: str) -> list[str]:
    """Parse the publication titles out of a BibTeX file.

    Every line whose first non-blank text is ``title`` is treated as a title
    field. The text after the first ``=`` is the value; braces and the
    trailing field separator are removed. Only the line that carries the
    field name is read, so a title wrapped over several lines is captured up
    to its first line break.

    Args:
        file_path (str): File path of the BibTeX file to parse.

    Returns:
        list[str]: Non-empty titles in order of appearance.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the file content cannot be decoded as UTF-8.
    """
    titles = []
    try:
        with open(file_path, "r", encoding="utf-8") as in_file:
            for line in in_file:
                if not line.strip().startswith("title"):
                    continue
                # Split on the first "=" only, so an "=" inside the title
                # survives.
                _, _, value = line.partition("=")
                title_clean = value.replace("{", "").replace("}", "").strip()
                # Drop the field separator regardless of trailing whitespace
                # or a missing final newline.
                title_clean = title_clean.rstrip(",").strip()
                if title_clean:
                    titles.append(title_clean)
    except OSError as err:
        print("OS error: {0}".format(err))
        raise
    except ValueError:
        print("Could not parse, bibtext file is malformed.")
        raise
    return titles
|
41 |
+
|
42 |
+
|
43 |
+
def get_url(title: str) -> Optional[str]:
    """Search DBLP for a publication title and return the URL of the first hit.

    Args:
        title (str): Title of the publication to search for.

    Returns:
        Optional[str]: URL of the DBLP page of the first search hit, or None
        when the request fails, the response is not valid JSON, or it
        contains no hit.
    """
    try:
        # Pass the title through ``params`` so requests URL-encodes it;
        # characters such as "&" or "#" would otherwise corrupt the query.
        result = requests.get(
            "https://dblp.org/search/publ/api",
            params={"q": title, "format": "json"},
            timeout=30,
        )
        return result.json()["result"]["hits"]["hit"][0]["info"]["url"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, undecodable body, or a response without hits.
        return None
|
60 |
+
|
61 |
+
|
62 |
+
def get_dblp_bibtext(url: str) -> Optional[str]:
    """Download the BibTeX record that belongs to a DBLP publication page.

    Args:
        url (str): URL of the publication page; ``.bib`` is appended to it to
            form the request URL.

    Returns:
        Optional[str]: The response body when the server answers with HTTP
        200, otherwise None.
    """
    response = requests.get(url + ".bib")
    if response.status_code != 200:
        return None
    return response.text
|
76 |
+
|
77 |
+
|
78 |
+
@click.command()
@click.argument("input_file")
@click.argument("outpu_file")
def clean_bibtex(outpu_file: str, input_file: str):
    """Convert an incomplete BibTeX file into a complete BibTeX file with dblp styling.

    Args:
        outpu_file (str): Destination for the new file.
        input_file (str): Input file to parse bibtext citations from.
    """
    titles = parse_bibtext_file_titles(input_file)
    errors = []
    dblp_citations = []

    # f-string so the actual number of publications is shown.
    click.echo(f"Requesting citation metadata for {len(titles)} publications, this may take a while...")
    with click.progressbar(length=len(titles)) as bar:
        for publication in titles:
            # A title counts as failed when either the search or the .bib
            # download yields nothing.
            if (site_url := get_url(publication)) and (dblp_citation := get_dblp_bibtext(site_url)):
                dblp_citations.append(dblp_citation)
            else:
                errors.append(" - " + publication)
            time.sleep(1)  # abide dblp crawl-delay
            bar.update(1)

    if dblp_citations:
        with open(outpu_file, "w") as outFile:
            outFile.write("\n".join(dblp_citations))
        click.echo(f"\nNew BibTeX file written to: {outpu_file}")
    else:
        click.echo("No citations to write.")
    if errors:
        click.echo("\nCould not create citations for:")
        click.echo("\n".join(errors))
|
115 |
+
|
116 |
+
|
117 |
+
# Invoke the click command when this file is executed as a script.
if __name__ == "__main__":
|
118 |
+
clean_bibtex()
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
requests
|
2 |
+
gradio
|