Ceyda Cinarel committed on
Commit
00c162d
•
2 Parent(s): 080042f 40c9da3

Merge pull request #1 from jqueguiner/streamlit-to-ovh-ai-platofrm

Browse files
Files changed (3) hide show
  1. Dockerfile +13 -0
  2. dl_all_data.py +14 -0
  3. run_me.sh +1 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the Common Voice Streamlit app.
FROM python:3.7

WORKDIR /workspace

# Install dependencies before copying the app source so that editing
# common_voice.py does not invalidate the cached dependency layer.
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY common_voice.py ./

# Streamlit serves on 8501 by default; declared once (the original repeated it).
EXPOSE 8501

CMD ["streamlit", "run", "common_voice.py"]
dl_all_data.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Download every Common Voice language in parallel to warm the local cache."""
from multiprocessing import Pool

from datasets import load_dataset


def cache_dataset(language, split=None):
    """Load one Common Voice language configuration.

    Args:
        language: Common Voice configuration name, e.g. ``"en"`` or ``"fy-NL"``.
        split: Optional split passed straight through to ``load_dataset``.

    Returns:
        A ``pandas.DataFrame`` built from the loaded split when ``split`` is
        truthy; otherwise whatever ``load_dataset`` returns for all splits.
    """
    dat = load_dataset("common_voice", language, split=split)
    if split:
        # Imported lazily: only this branch needs pandas, and the original
        # script referenced ``pd`` without ever importing it.
        import pandas as pd

        return pd.DataFrame(dat)
    return dat


language_codes = [
    'ab', 'ar', 'as', 'br', 'ca', 'cnh', 'cs', 'cv', 'cy', 'de',
    'dv', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr',
    'fy-NL', 'ga-IE', 'hi', 'hsb', 'hu', 'ia', 'id', 'it', 'ja', 'ka',
    'kab', 'ky', 'lg', 'lt', 'lv', 'mn', 'mt', 'nl', 'or', 'pa-IN',
    'pl', 'pt', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sl', 'sv-SE',
    'ta', 'th', 'tr', 'tt', 'uk', 'vi', 'vot', 'zh-CN', 'zh-HK', 'zh-TW',
]


if __name__ == "__main__":
    # The guard keeps worker processes from re-running the pool setup when
    # they import this module (required with the "spawn" start method).
    # One worker per language, as in the original script.
    with Pool(processes=len(language_codes)) as pool:
        # map() blocks until every language is done, so the pool can be
        # torn down as soon as the with-block exits.
        pool.map(cache_dataset, language_codes)
run_me.sh ADDED
@@ -0,0 +1 @@
 
 
# Launch the databuzzword/streamlit-common-voice image as an OVH AI job.
# Both -v mounts attach a path from the "cache" container in GRA read-write:
#   dataset/cache -> /root/.cache   (presumably the datasets download cache; confirm)
#   workspace     -> /workspace     (NOTE(review): same path as the image WORKDIR; confirm it does not hide the app)
# -p 8501 matches the port the Dockerfile exposes for Streamlit.
ovhai job run \
    -v cache@GRA/dataset/cache:/root/.cache:RW:cache \
    -p 8501 \
    -v cache@GRA/workspace:/workspace:RW:cache \
    --unsecure-http \
    databuzzword/streamlit-common-voice