khanfou commited on
Commit
c5aec4c
1 Parent(s): 500b120

Create FineTuning_Lang_models.py

Files changed (1)
  1. FineTuning_Lang_models.py +40 -0
FineTuning_Lang_models.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ import pandas as pd
+ import streamlit as st
+ HG_DIR = '/nlp/scr/msuzgun/cache_extra/huggingface'
+ # Specify HF cache dirs -- currently used only for the 2.7b model
+ os.environ['TRANSFORMERS_CACHE'] = f'{HG_DIR}/transformers'
+ os.environ['HF_HOME'] = HG_DIR
+
+ ## Import relevant libraries and dependencies
+ # (Install first if needed: pip install datasets transformers streamlit)
+ # Pretty print
+ from pprint import pprint
+ # Datasets load_dataset function
+ from datasets import load_dataset
+ # Transformers AutoTokenizer
+ from transformers import AutoTokenizer
+ # Standard PyTorch DataLoader
+ from torch.utils.data import DataLoader
+
+ tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
+
+ # Load the sample split of the HUPD dataset, restricted to January 2016
+ # filings for training and January 2017 filings for validation
+ dataset_dict = load_dataset('HUPD/hupd',
+                             name='sample',
+                             data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+                             cache_dir='/u/scr/nlp/data/HUPD',
+                             icpr_label=None,
+                             train_filing_start_date='2016-01-01',
+                             train_filing_end_date='2016-01-31',
+                             val_filing_start_date='2017-01-01',
+                             val_filing_end_date='2017-01-31',
+                             )
+
+ # Convert the training split into a DataFrame
+ df = pd.DataFrame.from_dict(dataset_dict["train"])
+
+ # Keep only the columns of interest
+ df = df[['patent_number', 'decision', 'abstract', 'claims', 'filing_date']]
+ st.dataframe(df)
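
The committed script imports AutoTokenizer and DataLoader and instantiates a DistilBERT tokenizer, but stops before using them. Below is a minimal sketch of how the loaded split might be tokenized and batched for fine-tuning. The decision_to_id label mapping, the max_length of 256, and the batch size of 16 are illustrative assumptions, not part of the commit.

# Sketch (assumptions noted above): tokenize abstracts and build a DataLoader
decision_to_id = {'REJECTED': 0, 'ACCEPTED': 1}  # hypothetical label mapping

def preprocess(batch):
    # Tokenize patent abstracts with the DistilBERT tokenizer created earlier
    tokens = tokenizer(batch['abstract'], truncation=True,
                       padding='max_length', max_length=256)
    # Map decisions to integer labels; unmapped values fall back to -1
    tokens['labels'] = [decision_to_id.get(d, -1) for d in batch['decision']]
    return tokens

train_set = dataset_dict['train'].map(preprocess, batched=True)
train_set.set_format(type='torch',
                     columns=['input_ids', 'attention_mask', 'labels'])
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)

Because the script calls st.dataframe, it is intended to run under Streamlit, e.g. streamlit run FineTuning_Lang_models.py.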