themeetjani committed on
Commit 6060e42 • 1 Parent(s): 0e4ef18

Upload 10 files

pages/AI_Chatbot.py ADDED
@@ -0,0 +1,15 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="AI_Chatbot.py",
+     page_icon="👋",
+ )
+
+ st.write("# AI Chatbot! 👋")
+
+ st.sidebar.success("Select a demo above.")
+
+ st.markdown(
+     """
+     **Work in progress!!!** """
+ )
pages/Auto_Code_Generation.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Auto_Code_Generation.py",
+     page_icon="👋",
+ )
+
+ st.write("# Auto Code Generation! 👋")
+
+
+ st.markdown(
+     """
+     **Work in progress!!!** """
+ )
pages/Auto_Report_Generation.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Auto_Report_Generation.py",
+     page_icon="👋",
+ )
+
+ st.write("# Auto Report Generation! 👋")
+
+
+ st.markdown(
+     """
+     **Work in progress!!!** """
+ )
pages/Auto_Score_Generation.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="Auto_Score_Generation.py",
+     page_icon="👋",
+ )
+
+ st.write("# Auto Score Generation! 👋")
+
+
+ st.markdown(
+     """
+     **Work in progress!!!** """
+ )
pages/core_risk.py ADDED
@@ -0,0 +1,135 @@
+ import numpy as np
+ import torch
+ import transformers
+ import streamlit as st
+ from streamlit import session_state
+ import json
+ import torch.nn.functional as F
+ import boto3
+ import pandas as pd
+ bucket = 'data-ai-dev2'
+ from transformers import BertTokenizer, BertModel
+ from torch import cuda
+ device = 'cuda' if cuda.is_available() else 'cpu'
+ import numpy
+ from numpy.random import seed
+ seed(1)
+ import emoji
+ import string
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import PorterStemmer  # PorterStemmer, LancasterStemmer
+ from nltk.stem import WordNetLemmatizer
+ import re
+ stemmer = PorterStemmer()
+
+ # download the NLTK resources on the first run (can be commented out afterwards)
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ nltk.download('stopwords')
+
+ lemmatizer = WordNetLemmatizer()
+
+ from transformers import pipeline
+ stopwords = nltk.corpus.stopwords.words('english')
+
+
+ # local paths to the fine-tuned model and tokenizer (not included in this commit)
+ model = 'C:/Users/Meet/Downloads/core_risk/models/'
+ tokenizer = 'C:/Users/Meet/Downloads/core_risk/tokenizer/'
+
+
+ classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True, max_length=512)
+
+ def pre_processing_str_esg(df_col):
+     df_col = df_col.lower()
+     # defining the function to remove punctuation
+     def remove_punctuation(text):
+         punctuationfree = "".join([i for i in text if i not in string.punctuation])
+         return punctuationfree
+     # storing the punctuation-free text
+     df_col = remove_punctuation(df_col)
+     df_col = re.sub(r"http\S+", " ", df_col)
+
+     def remove_stopwords(text):
+         return " ".join([word for word in str(text).split() if word not in stopwords])
+     # applying the function
+     df_col = remove_stopwords(df_col)
+     df_col = re.sub('[%s]' % re.escape(string.punctuation), ' ', df_col)
+     df_col = df_col.replace("¶", "")
+     df_col = df_col.replace("§", "")
+     df_col = df_col.replace('“', ' ')
+     df_col = df_col.replace('”', ' ')
+     df_col = df_col.replace('-', ' ')
+     REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
+     BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
+     df_col = REPLACE_BY_SPACE_RE.sub(' ', df_col)
+     df_col = BAD_SYMBOLS_RE.sub(' ', df_col)
+
+     # df_col = re.sub('W*dw*', '', df_col)
+     df_col = re.sub('[0-9]+', ' ', df_col)
+     df_col = re.sub(' +', ' ', df_col)  # collapse repeated spaces
+
+     def remove_emoji(string):
+         emoji_pattern = re.compile("["
+                                    u"\U0001F600-\U0001F64F"  # emoticons
+                                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
+                                    u"\U00002702-\U000027B0"
+                                    u"\U000024C2-\U0001F251"
+                                    "]+", flags=re.UNICODE)
+         return emoji_pattern.sub(r'', string)
+     df_col = remove_emoji(df_col)
+
+     return df_col
+
+ def pre_processing_str(df_col):
+     # df_col = df_col.lower()
+     if len(df_col.split()) >= 70:
+         return pre_processing_str_esg(df_col)
+     else:
+         df_col = df_col.replace('#', '')
+         df_col = df_col.replace('!', '')
+         df_col = re.sub(r"http\S+", " ", df_col)
+
+         df_col = re.sub('[0-9]+', ' ', df_col)
+         df_col = re.sub(' +', ' ', df_col)  # collapse repeated spaces
+         def remove_emojis(text):
+             return emoji.replace_emoji(text)
+         df_col = remove_emojis(df_col)
+         df_col = re.sub(r"(?:\@|https?\://)\S+", "", df_col)
+         df_col = re.sub(r"[^\x20-\x7E]+", "", df_col)
+         df_col = df_col.strip()
+         return df_col
+
+
+ # inference helper: pre-process the text and run the classifier on it
+
+ def process(text):
+     text = pre_processing_str(text)
+
+     try:
+         if len(text) != 0:
+             results = classifier(text, top_k=2)
+         else:
+             results = 'No Text'
+
+         return {'output_16': results}
+     except:
+         return {'output_16': 'something went wrong'}
+
+ st.set_page_config(page_title="core_risk", page_icon="📈")
+ if 'topic_class' not in session_state:
+     session_state['topic_class'] = ""
+
+ st.title("Topic Classifier")
+ text = st.text_area(label="Please write the text below",
+                     placeholder="What does the tweet say?")
+ def classify(text):
+     session_state['topic_class'] = process(text)
+
+
+ st.text_area("result", value=session_state['topic_class'])
+
+ st.button("Classify", on_click=classify, args=[text])
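For reference, a transformers text-classification pipeline called with `top_k=2` returns a list of label/score dictionaries, so `process` yields something shaped like the hypothetical sketch below; the actual label names depend on the fine-tuned model at the local path, which is not part of this commit.

# Hypothetical output shape only; real labels come from the fine-tuned model.
example_output = {
    'output_16': [
        {'label': 'LABEL_1', 'score': 0.91},
        {'label': 'LABEL_0', 'score': 0.09},
    ]
}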
pages/jury_records.py ADDED
@@ -0,0 +1,103 @@
+ # import the necessary packages
+ import streamlit as st
+ from streamlit import session_state
+ from langchain.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
+ from langchain.indexes import VectorstoreIndexCreator
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.docstore.document import Document
+ import os
+ from langchain.chat_models import ChatOpenAI
+ import openai
+ import json
+ # need to set the OpenAI key here or provide it as an environment variable
+ openai.api_key = "give api key"
+ model = ChatOpenAI(model='gpt-4', max_tokens=100, temperature=0)
+ st.set_page_config(page_title="jury_records", page_icon="📈")
+ # Extract the page content from the URL. LangChain's WebBaseLoader is used here,
+ # but any web-scraping function would work as well.
+ def extract(link):
+     res = []
+     loader = WebBaseLoader(link)
+     pages = loader.load()
+     for i in pages:
+         res.append(i.page_content.replace('\n', ''))
+     a = " ".join(res)
+     print(len(a))
+     if len(a) > 0:
+         return a
+     else:
+         return 'error'
+
+ # Summarize the content with GPT-4 using prompting.
+ def summarize(link):
+     context = extract(link)
+     if context != 'error':
+         # print(context)
+         response = openai.ChatCompletion.create(
+             model="gpt-4",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": f"Following context is given. {context}"},
+                 {
+                     "role": "user",
+                     "content": '''Summarize the content in detail. Follow these instructions while summarizing.\n Include the case no.\n Include all plaintiffs. \n Include the court name.
+                     \n Alias names should be included.\n Include the case no. \n Include all defendants.\n If a place is mentioned then include it, otherwise don't include it.
+                     \n The date format should be dd/mm/yyyy.\n If the case is settled for an amount then try to include the amount.
+                     If no amount is mentioned, don't mention anything about it. Only include this line if the case is
+                     settled; otherwise include the status of the case.\n\n<<REMEMBER>>\n\n Please try to include all the details. Don't leave out any information.'''
+                 }
+             ],
+             temperature=0,
+             max_tokens=1000,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0
+         )
+         return response.choices[0].message.content.strip()
+     else:
+         return 'error'
+
+ # Question dictionary for the QnA step. Many iterations went into this final set of questions;
+ # change it based on the input parameters that need to be extracted from the URL.
+ info_detail = {'case_type': 'provide case type or court system like "Criminal", "Family Law", "labour law"',
+                'name_of_court': 'provide name of court or jail or court record.',
+                'case_number': 'provide case number or country case number or bankruptcy case number',
+                'date_filed': 'what is the date when the case was filed or the date when the case was first formally/officially submitted?',
+                'plaintiff': 'Names of the Petitioner or plaintiff or applicant?',
+                'defendants': "Names of all defendants, respondent and alias. Name entity under 'Defendants'",
+                'nature_of_action': 'Summarize the reason behind the case within 20 words in detail',
+                'status': 'what is the status of the case?'}
+
+ # LangChain QnA over the summary produced by GPT-4, using a vector-store index.
+ def lang(context):
+     answer_dict = {}
+     docs = Document(page_content=context)
+     index2 = VectorstoreIndexCreator().from_documents([docs])
+     for key in info_detail:
+         ques = info_detail[key]
+         answer_dict[key] = index2.query(llm=model, question=ques)
+     index2.vectorstore.delete_collection()
+     return answer_dict
+
+ def process(url):
+     try:
+         summary = summarize(url)
+         if summary == 'error':
+             return {"details": "", "status": False}
+         else:
+             answer_dict = lang(summary)
+             return answer_dict
+     except:
+         return "Please try again"
+
+ if 'jury_records_dict' not in session_state:
+     session_state['jury_records_dict'] = ""
+
+ def Jury(url):
+     session_state['jury_records_dict'] = process(url)
+
+ st.title("Jury Records")
+
+ jury_url = st.text_area(label="Please enter the jury records link",
+                         placeholder="Jury records Link")
+
+ st.text_area("result", value=session_state['jury_records_dict'])
+
+ st.button("Get answer dictionary", on_click=Jury, args=[jury_url])
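As the comment near the top of this file notes, the key can be supplied via an environment variable instead of the hard-coded placeholder; a minimal sketch, assuming an `OPENAI_API_KEY` variable is set in the environment:

import os
import openai

# Read the key from the environment rather than hard-coding it (assumes OPENAI_API_KEY is set).
openai.api_key = os.environ["OPENAI_API_KEY"]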
pages/text_clustering.py ADDED
@@ -0,0 +1,14 @@
+ import streamlit as st
+
+ st.set_page_config(
+     page_title="text_clustering.py",
+     page_icon="👋",
+ )
+
+ st.write("# Text Clustering! 👋")
+
+
+ st.markdown(
+     """
+     **Work in progress!!!** """
+ )
pages/topic_classification.py ADDED
@@ -0,0 +1,73 @@
+ # importing all the necessary packages here
+ import streamlit as st
+ from streamlit import session_state
+ import pandas as pd
+ import numpy as np
+ from scipy import spatial
+ from sentence_transformers import SentenceTransformer
+ import json
+
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # Hugging Face model used for embeddings
+ # cosine similarity between two embedding vectors
+ def cosine_similarity(x, y):
+     return 1 - spatial.distance.cosine(x, y)
+
+ # reading the topic file into a dataframe
+ df = pd.read_excel(r'C:\Users\Meet\Downloads/topic_data.xlsx')
+ # df2 = pd.read_csv("BBC News Train.csv")  # sample news article file
+ # storing level-1 and level-2 segments into a dictionary first
+ result_dict = df.groupby('LEVEL 1')['new_level_2'].apply(list).to_dict()
+ # storing L1 segments
+ segments = list(result_dict.keys())
+ segments_encode = model.encode(segments)  # encoding L1 segments with the model
+ # creating an embedding dictionary of all L1 segments and L2 segments
+ # embedding dictionary for L2 segments
+ embeddings_dict = {}
+ for key, val in result_dict.items():
+     embed = model.encode(result_dict[key])
+     embeddings_dict[key] = embed
+
+ # function for scoring L1 segments
+ def segments_finder(text_encode):
+     score_dict = {}
+     for segment, name in zip(segments_encode, segments):
+         similarity_score = cosine_similarity(segment, text_encode)
+         score_dict[name] = similarity_score
+     return sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
+
+ def level2(article_summary):
+     l1 = {}
+     l2 = {}
+     output = {}
+     text_encode = model.encode(article_summary)
+     l1_pred = segments_finder(text_encode)
+     # iterating over the top L1 segments to find their L2 segments
+     for i in l1_pred[:2]:
+         score_dict = {}
+         l2_segments = result_dict[i[0]]
+         l2_segments_encode = embeddings_dict[i[0]]
+         for segment, name in zip(l2_segments_encode, l2_segments):
+             similarity_score = cosine_similarity(segment, text_encode)
+             score_dict[name] = similarity_score
+         l2_pred = dict(list(sorted(score_dict.items(), key=lambda x: x[1], reverse=True))[:2])
+         print(l2_pred)
+         l2[i[0]] = l2_pred
+     output['l1'] = dict(list(sorted(dict(l1_pred).items(), key=lambda x: x[1], reverse=True))[:2])
+     output['l2'] = l2
+     return output
+
+ st.set_page_config(page_title="topic_classification", page_icon="📈")
+
+ if 'topic_class' not in session_state:
+     session_state['topic_class'] = ""
+
+ st.title("Topic Classifier")
+ text = st.text_area(label="Please write the text below",
+                     placeholder="What does the tweet say?")
+ def classify(text):
+     session_state['topic_class'] = level2(text)
+
+
+ st.text_area("result", value=session_state['topic_class'])
+
+ st.button("Classify", on_click=classify, args=[text])
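`topic_data.xlsx` is not included in this commit; a minimal sketch of the column layout the script above expects (segment names are made up), showing how `result_dict` comes out of the groupby:

import pandas as pd

# Hypothetical stand-in for topic_data.xlsx with the two columns the script reads.
df = pd.DataFrame({
    'LEVEL 1': ['Environment', 'Environment', 'Governance'],
    'new_level_2': ['Emissions', 'Waste', 'Board Structure'],
})
result_dict = df.groupby('LEVEL 1')['new_level_2'].apply(list).to_dict()
# {'Environment': ['Emissions', 'Waste'], 'Governance': ['Board Structure']}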
pages/tweet_classification.py ADDED
@@ -0,0 +1,30 @@
+ import streamlit as st
+ from streamlit import session_state
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import pipeline
+
+ tokenizer = AutoTokenizer.from_pretrained("themeetjani/tweet-classification")
+ model = AutoModelForSequenceClassification.from_pretrained("themeetjani/tweet-classification")
+
+ classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, truncation=True, max_length=512)
+
+ st.set_page_config(page_title="Classification", page_icon="📈")
+ if 'tweet_class' not in session_state:
+     session_state['tweet_class'] = ""
+
+ def classify(tweet):
+     predicted_classes = classifier(tweet, top_k=1)
+     print(tweet)
+     print(predicted_classes)
+     session_state['tweet_class'] = predicted_classes[0]['label']
+
+ st.title("Tweet Classifier")
+
+ tweet = st.text_area(label="Please write the tweet below",
+                      placeholder="What does the tweet say?")
+
+ st.text_area("result", value=session_state['tweet_class'])
+
+ st.button("Classify", on_click=classify, args=[tweet])
+
pages/untitled.txt ADDED
File without changes