edugp committed on
Commit
d131aa3
1 Parent(s): 6d1a001

Add tests and fix the sample-size cap when splitting documents into sentences: use the minimum of the sample size and the total number of sentences, rather than the minimum of the sample size and the number of original documents

Browse files
README.md CHANGED
@@ -20,7 +20,7 @@ The app is hosted [here](https://huggingface.co/spaces/edugp/perplexity-lenses).
20
  python -m streamlit run app.py
21
  ```
22
 
23
- # CLI
24
  The CLI with no arguments defaults to running mc4 in Spanish.
25
  For full usage:
26
  ```
@@ -40,3 +40,7 @@ python cli.py \
40
  --model-name distiluse-base-multilingual-cased-v1 \
41
  --output-file perplexity.html
42
  ```
 
 
 
 
20
  python -m streamlit run app.py
21
  ```
22
 
23
+ # CLI:
24
  The CLI with no arguments defaults to running mc4 in Spanish.
25
  For full usage:
26
  ```
40
  --model-name distiluse-base-multilingual-cased-v1 \
41
  --output-file perplexity.html
42
  ```
43
+ # Tests:
44
+ ```
45
+ python -m unittest discover -s ./tests/ -p "test_*.py"
46
+ ```
perplexity_lenses/data.py CHANGED
@@ -40,4 +40,4 @@ def hub_dataset_to_dataframe(
40
 
41
  def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
42
  df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
43
- return df_sentences.sample(min(sample, df.shape[0]), random_state=seed)
40
 
41
  def documents_df_to_sentences_df(df: pd.DataFrame, text_column: str, sample: int, seed: int = 0):
42
  df_sentences = pd.DataFrame({text_column: np.array(df[text_column].map(lambda x: x.split("\n")).values.tolist()).flatten()})
43
+ return df_sentences.sample(min(sample, df_sentences.shape[0]), random_state=seed)
tests/__init__.py ADDED
File without changes
tests/test_data.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+
3
+ import pandas as pd
4
+
5
+ from perplexity_lenses.data import documents_df_to_sentences_df
6
+
7
+
8
class TestData(unittest.TestCase):
    """Unit tests for the helpers in ``perplexity_lenses.data``."""

    def test_documents_df_to_sentences_df(self):
        """A single two-line document is split into two one-sentence rows."""
        documents = pd.DataFrame({"text": ["foo\nbar"]})
        sentences = documents_df_to_sentences_df(documents, "text", 100)
        expected = pd.DataFrame({"text": ["foo", "bar"]})
        # check_like=True ignores index/column order, so the row order produced
        # by the sampling step does not affect the comparison.
        pd.testing.assert_frame_equal(sentences, expected, check_like=True, check_exact=True)