|
# Streamlit script: load the HUPD "sample" configuration and display selected
# fields of its training split as an interactive table.

import os

import pandas as pd
import streamlit as st

# Root directory used for the Hugging Face caches.
HG_DIR = '/nlp/scr/msuzgun/cache_extra/huggingface'

# Redirect the Hugging Face caches to HG_DIR. These assignments intentionally
# come BEFORE the `datasets` import below: the Hugging Face libraries resolve
# their cache locations from the environment when they are imported, so
# setting the variables afterwards would have no effect on this process.
os.environ['TRANSFORMERS_CACHE'] = f'{HG_DIR}/transformers'
os.environ['HF_HOME'] = HG_DIR

from pprint import pprint  # noqa: E402  (import must follow the env setup)

from datasets import load_dataset  # noqa: E402
from torch.utils.data import DataLoader  # noqa: E402

# Columns shown in the table, in display order.
DISPLAY_COLUMNS = ['patent_number', 'decision', 'abstract', 'claims', 'filing_date']

# Load the 'sample' configuration of HUPD. The train split is restricted to
# applications filed in January 2016 and the validation split to January 2017.
# NOTE(review): `icpr_label=None` is forwarded to the dataset's loading script
# unchanged; its exact meaning is defined there -- confirm before changing it.
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    cache_dir='/u/scr/nlp/data/HUPD',
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-31',
    val_filing_start_date='2017-01-01',
    val_filing_end_date='2017-01-31',
)

# Convert the training split to pandas through the dataset's own Arrow-backed
# exporter instead of letting pandas iterate the split row by row, then keep
# only the display columns. `reindex` matches the previous
# `pd.DataFrame(df, columns=...)` behaviour: a column that is absent from the
# split appears as an all-NaN column rather than raising.
df = dataset_dict["train"].to_pandas().reindex(columns=DISPLAY_COLUMNS)

# Render the table in the Streamlit page.
st.dataframe(df)
|
|