itsmariamaraki committed on
Commit
48a8c9f
•
1 Parent(s): 3bef4a4

Upload 7 files

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ Semiconductors.pdf filter=lfs diff=lfs merge=lfs -text
Efficient_Estimation_of_Word_Representations.pdf ADDED
Binary file (229 kB). View file
 
Hidden_Technical_Debt.pdf ADDED
Binary file (166 kB). View file
 
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Assessment3
3
+ emoji: 👩🏼‍💻
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.8.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Semiconductors.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b1adbf493d65309de9821c550e8456a9a0940623357b509ae19ef0345fe1e0a
3
+ size 2469240
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://huggingface.co/spaces/itsmariamaraki/AAI-Assessment3
2
+
3
+ # Here are the imports
4
+
5
+ import gradio as gr
6
+ import PyPDF2
7
+ from PyPDF2 import PdfReader
8
+ from pdfminer.high_level import extract_pages, extract_text
9
+ from transformers import pipeline, AutoProcessor, AutoModel, AutoTokenizer
10
+ import torch
11
+ import soundfile as sf
12
+ from IPython.display import Audio
13
+ from datasets import load_dataset
14
+ from io import BytesIO
15
+ import os
16
+
17
+ # Here is the code
18
+
19
def abstract(pdf_file):
    """Extract the abstract section from a PDF supplied as raw bytes.

    Pages are scanned in order. On the first page whose text contains the
    word "abstract" (case-insensitive), the text from that word up to the
    next occurrence of "introduction" is returned. When "introduction" does
    not follow on that page, the remainder of the page is returned.

    Args:
        pdf_file: The PDF content as a bytes-like object.

    Returns:
        The abstract text, starting at the word "abstract", or an empty
        string when no input was given or no page mentions an abstract.
    """
    if not pdf_file:
        # Nothing uploaded: same result as "no abstract found".
        return ''

    pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file))

    for page in pdf_reader.pages:
        text = page.extract_text() or ''
        lowered = text.lower()

        start_index = lowered.find('abstract')
        if start_index == -1:
            continue

        # Search for the end marker only after the start marker. A plain
        # find() could match an earlier "introduction" (giving an empty
        # slice) or return -1 (which would chop the last character off).
        end_index = lowered.find('introduction', start_index)
        if end_index == -1:
            end_index = len(text)

        # Only the first page that mentions an abstract is considered.
        return text[start_index:end_index]

    return ''
35
+
36
+
37
+
38
# Both models are created once at import time and shared by every request.

# Summarizer: per the author's notes, the best summarization model tried for
# this assessment.
summarization = pipeline(
    task='summarization',
    model='pszemraj/long-t5-tglobal-base-16384-book-summary',
)

# Text-to-speech: per the author's notes, the voice is slightly distorted but
# the output is good and generation takes less time.
audiospeech = pipeline(
    task='text-to-speech',
    model='suno/bark-small',
)
40
+
41
+
42
+
43
def summarization_n_audiospeech(pdf_file):
    """Summarize a PDF's abstract in one sentence and read it aloud.

    Args:
        pdf_file: The uploaded PDF content as a bytes-like object.

    Returns:
        A ``(summary, audio)`` tuple. ``summary`` is the first sentence of
        the generated summary and ``audio`` is that sentence as WAV-encoded
        bytes. When no abstract can be extracted, ``summary`` is a short
        explanatory message and ``audio`` is ``None``.
    """
    abstract_text = abstract(pdf_file)
    if not abstract_text.strip():
        # Avoid feeding empty text to the models; report the problem instead.
        return 'No abstract section was found in the uploaded PDF.', None

    # The length limits were tuned by hand on one article so that the result
    # is roughly one sentence; other documents may need different values.
    summary = summarization(abstract_text, max_length=50, min_length=10)[0]['summary_text']

    # Keep only the first sentence of the summary.
    fin_summary = summary.split('.')[0] + '.'

    # Convert the one-sentence summary into speech.
    tts_output = audiospeech(fin_summary)
    audio_data = tts_output['audio'][0]
    # Use the rate reported by the model rather than a hard-coded value;
    # writing the WAV with the wrong rate changes playback speed and pitch.
    sampling_rate = tts_output['sampling_rate']

    with BytesIO() as buffer:
        sf.write(buffer, audio_data, sampling_rate, format='wav')
        audio_bytes = buffer.getvalue()

    return fin_summary, audio_bytes
59
+
60
+
61
+
62
# Sample PDFs offered as clickable examples; resolved relative to this file.
_APP_DIR = os.path.dirname(__file__)
_EXAMPLE_PDFS = [
    os.path.join(_APP_DIR, pdf_name)
    for pdf_name in (
        'Hidden_Technical_Debt.pdf',
        'Semiconductors.pdf',
        'Efficient_Estimation_of_Word_Representations.pdf',
    )
]

# Web UI: one PDF upload in, a text summary and its spoken version out.
iface = gr.Interface(
    fn=summarization_n_audiospeech,
    # Per the author's note, omitting `type` led to an error in the Gradio output.
    inputs=gr.File(label='upload PDF', type='binary'),
    outputs=[
        gr.Textbox(label='Summarization of the Abstract:'),
        gr.Audio(label="Audio Speech of the Abstract's Summary:"),
    ],
    title="PDF's Abstract Summarization & Audio Speech Processor",
    description="App that generates a one-line summary of the abstract & a speech audio of this summarization -- requirements: app only accepts PDFs which include an ABSTRACT section",
    examples=_EXAMPLE_PDFS,
)

iface.launch()
gitattributes ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Article[[:space:]]6[[:space:]]BloombergGPT_[[:space:]]A[[:space:]]Large[[:space:]]Language[[:space:]]Model[[:space:]]for[[:space:]]Finance.pdf filter=lfs diff=lfs merge=lfs -text
37
+ BloombergGPT.pdf filter=lfs diff=lfs merge=lfs -text
38
+ Article[[:space:]]8[[:space:]]Llama[[:space:]]2_[[:space:]]Open[[:space:]]Foundation[[:space:]]and[[:space:]]Fine-Tuned[[:space:]]Chat[[:space:]]Models.pdf filter=lfs diff=lfs merge=lfs -text
39
+ Llama_2.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Article[[:space:]]10[[:space:]]The[[:space:]]Future[[:space:]]of[[:space:]]AI[[:space:]]is[[:space:]]Hybrid.pdf filter=lfs diff=lfs merge=lfs -text
41
+ The_Future_of_AI_is_Hybrid.pdf filter=lfs diff=lfs merge=lfs -text
42
+ Semiconductors.pdf filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ PyPDF2
3
+ torch
4
+ torchaudio
5
+ pdfplumber
6
+ pdfminer.six
7
+ datasets
8
+ sentencepiece
9
+ gradio
10
+ soundfile
11
+ Ipython
12
+ numpy