zhayunduo committed on
Commit
a966aee
1 Parent(s): 5d67bd5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +63 -41
README.md CHANGED
@@ -1,42 +1,64 @@
1
- ---
2
- license: apache-2.0
3
- ---
4
- ## A project by NUS ISS students Frank Cao, Gerong Zhang, Jiaqi Yao, Sikai Ni, Yunduo Zhang
5
-
6
- This model is fine tuned with roberta-base model on 3200000 comments from stocktwits, with the user labeled tags 'Bullish' or 'Bearish'
7
-
8
- ```python
9
- from transformers import RobertaForSequenceClassification, RobertaTokenizer
10
- from transformers import pipeline
11
- import pandas as pd
12
-
13
- # the model was trained upon below preprocessing
14
- def process_text(texts):
15
-
16
- # remove URLs
17
- texts = re.sub(r'https?://\S+', "", texts)
18
- texts = re.sub(r'www.\S+', "", texts)
19
- # remove '
20
- texts = texts.replace(''', "'")
21
- # remove symbol names
22
- texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
23
- texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
24
- # remove usernames
25
- texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
26
- # demojize
27
- texts = emoji.demojize(texts, delimiters=("", " "))
28
-
29
- return texts.strip()
30
-
31
- tokenizer_loaded = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
32
- model_loaded = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
33
-
34
- nlp = pipeline("text-classification", model=model_loaded, tokenizer=tokenizer_loaded)
35
-
36
- sentences = pd.Series(['just buy','just sell it','entity rocket to the sky!','go down','even though it is going up, I still think it will not keep this trend in the near future'])
37
- # sentences = list(sentences.apply(process_text))
38
- sentences = list(sentences) # if input text contains https, @ or # or $ symbols, better apply preprocess to get a more accurate result
39
- results = nlp(sentences)
40
- print(results) # 2 labels, label 0 is bearish, label 1 is bullish
41
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  ```
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ ## **Sentiment inferencing model for stock-related comments**
6
+
7
+ ### A project by NUS ISS students Frank Cao, Gerong Zhang, Jiaqi Yao, Sikai Ni, Yunduo Zhang
8
+
9
+ <br />
10
+
11
+ ### Dataset
12
+
13
+ This model is fine-tuned from the roberta-base model on 3,200,000 comments from StockTwits, using the user-labeled tags 'Bullish' or 'Bearish'.
14
+ dataset link:
15
+
16
+ <br />
17
+
18
+ ### Training information
19
+ - batch size 32
20
+ - learning rate 2e-5
21
+
22
+ | Epoch | Train loss | Validation loss | Validation accuracy |
23
+ | ----------- | ----------- | ---------------- | ------------------- |
24
+ | epoch1 | 0.3495 | 0.2956 | 0.8679 |
25
+ | epoch2 | 0.2717 | 0.2235 | 0.9021 |
26
+ | epoch3 | 0.2360 | 0.1875 | 0.9210 |
27
+ | epoch4 | 0.2106 | 0.1603 | 0.9343 |
28
+
29
+ # How to use
30
+ ```python
31
+ from transformers import RobertaForSequenceClassification, RobertaTokenizer
32
+ from transformers import pipeline
33
+ import pandas as pd
34
+
35
+ # the model was trained on text preprocessed as shown below
36
+ def process_text(texts):
37
+
38
+ # remove URLs
39
+ texts = re.sub(r'https?://\S+', "", texts)
40
+ texts = re.sub(r'www.\S+', "", texts)
41
+ # replace the HTML entity &#39; with a plain apostrophe
42
+ texts = texts.replace('&#39;', "'")
43
+ # rewrite #tags and $tickers as hashtag_<name> / cashtag_<name>
44
+ texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
45
+ texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
46
+ # remove usernames
47
+ texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
48
+ # demojize
49
+ texts = emoji.demojize(texts, delimiters=("", " "))
50
+
51
+ return texts.strip()
52
+
53
+ tokenizer_loaded = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
54
+ model_loaded = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
55
+
56
+ nlp = pipeline("text-classification", model=model_loaded, tokenizer=tokenizer_loaded)
57
+
58
+ sentences = pd.Series(['just buy','just sell it','entity rocket to the sky!','go down','even though it is going up, I still think it will not keep this trend in the near future'])
59
+ # sentences = list(sentences.apply(process_text))
60
+ sentences = list(sentences) # if input text contains https, @ or # or $ symbols, better apply preprocess to get a more accurate result
61
+ results = nlp(sentences)
62
+ print(results) # 2 labels, label 0 is bearish, label 1 is bullish
63
+
64
  ```