boris committed on
Commit
a96c347
·
1 Parent(s): e226ca6

feat(text): use hf_hub for wiki word count

Browse files
Files changed (1) hide show
  1. dalle_mini/text.py +5 -13
dalle_mini/text.py CHANGED
@@ -2,36 +2,28 @@
2
  Utilities for processing text.
3
  """
4
 
5
- import requests
6
  from pathlib import Path
7
  from unidecode import unidecode
8
 
9
  import re, math, random, html
10
  import ftfy
11
 
12
- WIKI_STATS_URL = "https://github.com/borisdayma/wikipedia-word-frequency/raw/feat-update/results/enwiki-20210820-words-frequency.txt"
13
- WIKI_STATS_LOCAL = Path(WIKI_STATS_URL).parts[-1]
14
 
15
  # based on wiki word occurence
16
  person_token = [("a person", 282265), ("someone", 121194), ("somebody", 12219)]
17
  temp_token = "xtokx" # avoid repeating chars
18
 
19
 
20
- def get_wiki_file():
21
- if not Path(WIKI_STATS_LOCAL).exists():
22
- r = requests.get(WIKI_STATS_URL, stream=True)
23
- with open(WIKI_STATS_LOCAL, "wb") as fd:
24
- for chunk in r.iter_content(chunk_size=128):
25
- fd.write(chunk)
26
- return WIKI_STATS_LOCAL
27
-
28
-
29
  class HashtagProcessor:
30
  # Adapted from wordninja library
31
  # We use our wikipedia word count + a good heuristic to make it work
32
  def __init__(self):
 
 
 
33
  self._word_cost = (
34
- l.split()[0] for l in Path(get_wiki_file()).read_text().splitlines()
35
  )
36
  self._word_cost = {
37
  str(k): math.log(float(i + 1)) for i, k in enumerate(self._word_cost)
 
2
  Utilities for processing text.
3
  """
4
 
 
5
  from pathlib import Path
6
  from unidecode import unidecode
7
 
8
  import re, math, random, html
9
  import ftfy
10
 
11
+ from huggingface_hub import hf_hub_download
 
12
 
13
  # based on wiki word occurence
14
  person_token = [("a person", 282265), ("someone", 121194), ("somebody", 12219)]
15
  temp_token = "xtokx" # avoid repeating chars
16
 
17
 
 
 
 
 
 
 
 
 
 
18
  class HashtagProcessor:
19
  # Adapted from wordninja library
20
  # We use our wikipedia word count + a good heuristic to make it work
21
  def __init__(self):
22
+ wiki_word_frequency = hf_hub_download(
23
+ "dalle-mini/dalle-mini", filename="enwiki-words-frequency.txt"
24
+ )
25
  self._word_cost = (
26
+ l.split()[0] for l in Path(wiki_word_frequency).read_text().splitlines()
27
  )
28
  self._word_cost = {
29
  str(k): math.log(float(i + 1)) for i, k in enumerate(self._word_cost)