zao1234 committed on
Commit
62074b2
1 Parent(s): fd690bc

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +97 -7
README.md CHANGED
@@ -1,10 +1,100 @@
1
  ---
2
- title: README
3
- emoji: 📉
4
- colorFrom: pink
5
- colorTo: indigo
6
- sdk: static
7
- pinned: false
8
  ---
 
 
 
 
9
 
10
- Edit this `README.md` markdown file to author your organization card.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: mit
3
+ datasets:
4
+ - HermesPenn/athena_data
 
 
 
5
  ---
6
+ # News Classifier -- The Evaluation Pipeline
7
+ ## The colab link: https://colab.research.google.com/drive/1OmIHVN0joIgjGgYCdqLu2EO2By4yT5Xd#scrollTo=MsmKRoHuHyIp
8
+ ## Ziao You, Samuel Vara, Surya Sandeep Akella
9
+ ----------------------
10
 
11
+ ## The code here is the same as in the Colab link. It shows how to call our model to evaluate the test set. Please use the Colab link for easier usage.
12
+ ##
13
+
14
+ ----------------------
15
+
16
+ ### pip install package
17
+ ```
18
+ !pip install datasets > delete.txt
19
+ ```
20
+
21
+ ### !!! Load Test Set -- Change the file path to your test set
22
+ ```
23
+ import pandas as pd
24
+ df_test = pd.read_csv('/content/test_data.csv',index_col="Unnamed: 0")
25
+ df_test.head()
26
+ ```
27
+
28
+ ### Load Model from Hugging Face Hub (Don't change)
29
+ ```
30
+ from huggingface_hub import snapshot_download
31
+ import keras
32
+
33
+ # Download model from hugging face
34
+ local_path = snapshot_download(repo_id="HermesPenn/athena_model")
35
+
36
+ # Load model from local
37
+ model = keras.saving.load_model(local_path)
38
+ ```
39
+ ### Load Training set (Don't change)
40
+ ```
41
+ from datasets import load_dataset
42
+
43
+ dataset = load_dataset("HermesPenn/athena_data")
44
+ dataset = dataset['train']
45
+ data = dataset.to_pandas()
46
+ data.head()
47
+ ```
48
+
49
+ ### Fit_transform label_encoder and tokenizer (Don't change)
50
+ ```
51
+ from sklearn.preprocessing import LabelEncoder
52
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
53
+ from tensorflow.keras.preprocessing.text import Tokenizer
54
+ # Data preprocessing
55
+ le = LabelEncoder()
56
+ data['label'] = le.fit_transform(data['source'])
57
+ X = data['title']
58
+ y = data['label']
59
+
60
+ # Tokenize and pad text data
61
+ tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
62
+ tokenizer.fit_on_texts(X)
63
+ X_seq = tokenizer.texts_to_sequences(X)
64
+ X_padded = pad_sequences(X_seq, maxlen=200, padding='post', truncating='post')
65
+ ```
66
+
67
+ ### Test set Evaluation (Don't change)
68
+
69
+ ```
70
+ from sklearn.metrics import classification_report
71
+
72
+ X_test = df_test['title']
73
+ y_test = df_test['label']
74
+
75
+
76
+ X_test_seq = tokenizer.texts_to_sequences(X_test)
77
+ X_test_padded = pad_sequences(X_test_seq, maxlen=200, padding='post', truncating='post')
78
+
79
+ # Predict the labels using the model
80
+ y_pred_probs = model.predict(X_test_padded)
81
+ y_pred = (y_pred_probs > 0.5).astype(int)
82
+
83
+ # Evaluate the model
84
+ print("Classification Report:")
85
+ print(classification_report(y_test, y_pred))
86
+
87
+
88
+ try:
89
+ news_outlets = le.inverse_transform(y_pred.flatten()) # le must be pre-fitted
90
+ df_test['Predicted News Outlet'] = news_outlets
91
+ except NameError:
92
+ df_test['Predicted News Outlet'] = y_pred.flatten()
93
+ ```
94
+
95
+ ```
96
+ # Display test set with predictions
97
+ print("\nTest Set with Predictions:")
98
+
99
+ df_test[['title', 'News Outlet', 'Predicted News Outlet']]
100
+ ```