pere committed on
Commit
c6f0cd5
1 Parent(s): d7b2d1e

deuncaser beta

Browse files
Files changed (3) hide show
  1. README.md +3 -4
  2. app.py +42 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
  title: DeUnCaser
3
- emoji: 🐠
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.9.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
 
1
  ---
2
  title: DeUnCaser
3
+ emoji: 🌖
4
+ colorFrom: green
5
+ colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.9.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ from transformers import T5ForConditionalGeneration, T5TokenizerFast, T5Config
4
+
5
+ #mytext= 'Vi bruker ikke tegnsetting eller store bokstaver når vi prater. Vi slår også sammen ord, og i praksis er dermed heller ikke mellomrom meningsbærende. Prøv å fjerne tegnsetting, store bokstaver og mellomrom fra dette avsnittet. Se om den nye North-T5-modellen greier å sette sammen til et nytt meningsbærende avsnitt.'
6
+ option_changed = 0
7
+
8
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_model():
    """Load the DeUnCaser T5 checkpoint together with its tokenizer.

    The function is wrapped in ``st.cache`` so the loaded objects can be
    reused instead of being re-created on every call.

    Returns:
        A ``(model, tokenizer)`` tuple: a ``T5ForConditionalGeneration``
        instance and the matching ``T5TokenizerFast``, both loaded from the
        ``north/demo-deuncaser-base`` checkpoint.
    """
    checkpoint = "north/demo-deuncaser-base"
    tokenizer = T5TokenizerFast.from_pretrained(checkpoint)
    # The configuration is loaded explicitly and handed to the model loader.
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint,
        config=T5Config.from_pretrained(checkpoint),
    )
    return (model, tokenizer)
15
+
16
def deuncase(model, tokenizer, text):
    """Run *text* through the model and return the decoded generations.

    Args:
        model: Object exposing ``generate(**inputs)``.
        tokenizer: Callable that encodes text (called with
            ``return_tensors="pt"``) and provides ``batch_decode``.
        text: The input text to restore.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated tokens,
        with special tokens skipped.
    """
    model_inputs = tokenizer(text, return_tensors="pt")
    output_ids = model.generate(**model_inputs)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded
22
+
23
# ---- Page layout ----------------------------------------------------------
st.title("DeUnCaser")

# Sidebar: short description of what the demo does.
expander = st.sidebar.expander("About")
expander.write(
    "This web app adds spaces, punctuation and capitalisation back into the text."
)
expander.write(
    "You can use the examples below, but to really test the effect of the "
    "model: Write or copy text from the Internet, and then manually remove "
    "spaces, punctuation, cases etc. Try to restore the text."
)

# Sidebar: canned examples; the selected one pre-fills the text area below.
option = st.sidebar.selectbox(
    "Examples:",
    (
        "tirsdag var travel for ukrainas president volodymyr zelenskyj på morgenen tok han imot polens statsminister mateusz morawiecki",
        "tirsdagvartravelforukrainaspresidentvolodymyrzelenskyjpåkveldentokhanimotpolensstatsministermateuszmorawiecki",
        "deterikkelettåholderedepåstoreogsmåbokstavermanmåforeksempelhuskestorforbokstavnårmanskriveromkrimhalvøyamenkunbrukelitenforbokstavnårmanhenvisertilenkrimroman",
        "detteerenlitendemosomerlagetavperegilkummervoldhanerenforskersomtidligerejobbetvednasjonalbiblioteketimoirana",
        "sentpå60talletvardetfaktisknoensomkalteungensinperegilkummervoldidagerdetikkelengersåvanligåbrukedobbeltnavninorgehvasynesduomdet",
    ),
)

# Free-text input (label intentionally left empty), capped at 1000 characters.
text = st.text_area("", max_chars=1000, value=option)

run = st.button("Run DeUnCaser")

if run:
    if not text.strip():
        # Nothing to restore: skip loading and running the model.
        st.warning("Please enter some text to restore.")
    else:
        model, tokenizer = load_model()
        translated_text = deuncase(model, tokenizer, text)
        # Show the first decoded sequence, or a fallback message when the
        # decoded result is empty.
        st.write(translated_text[0] if translated_text else "Unknown Error Translating Text")
42
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ torch
3
+ transformers
4
+ transformers[sentencepiece]