#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import PyPDF2
import jieba
import jieba.posseg as pseg
from jieba import analyse
import gradio as gr
import numpy as np
import os

def countIDF(text,topK):
    """Return raw occurrence counts for the top keywords of *text*.

    The text is tokenized with ``jieba.cut`` and every token is tallied.
    ``jieba.analyse.extract_tags`` then selects up to *topK* keywords, and
    the tally of each selected keyword is returned, in the order
    ``extract_tags`` yields them.

    Args:
        text: The document text to analyse.
        topK: Maximum number of keywords requested from ``extract_tags``.

    Returns:
        A list of integer counts, one per extracted keyword.

    Raises:
        KeyError: If an extracted keyword never appeared among the tokens
            produced by ``jieba.cut``.
    """
    # Tally how often each token occurs in the segmented text.
    occurrences = {}
    for token in jieba.cut(text):
        occurrences[token] = occurrences.get(token, 0) + 1

    # Each entry is a (keyword, weight) pair; only the keyword is used.
    ranked = analyse.extract_tags(text,topK,withWeight=True)
    return [occurrences[entry[0]] for entry in ranked]

def pers_sim(a,b):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Both inputs are mean-centred and the cosine of the centred vectors is
    returned.

    Args:
        a: Sequence of numbers (anything ``numpy.asarray`` accepts).
        b: Sequence of numbers, normally the same length as *a*.

    Returns:
        The correlation as a ``float`` in ``[-1, 1]``. When the correlation
        is undefined (either input is empty, or either input has zero
        variance so the denominator would be 0) ``0.0`` is returned instead
        of ``nan``.

    Raises:
        ValueError: If the two inputs are non-empty and their shapes cannot
            be broadcast together.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # An empty vector carries no information; avoid the mean of an empty
    # slice, which would produce nan and a runtime warning.
    if a.size == 0 or b.size == 0:
        return 0.0

    a = a - np.average(a)
    b = b - np.average(b)

    denominator = np.sqrt(np.sum(a**2)) * np.sqrt(np.sum(b**2))
    # A constant vector has zero variance, so the correlation is undefined;
    # report "no correlation" rather than dividing by zero.
    if denominator == 0:
        return 0.0

    return float(np.sum(a*b) / denominator)

def splitWord_PersionSimlaryty(str_a,str_b,topK=20,sim=pers_sim):
    """Score two texts by comparing their keyword-count vectors.

    Each text is turned into a vector with ``countIDF`` (using the same
    *topK* for both), and the two vectors are handed to *sim*.

    Args:
        str_a: First document text.
        str_b: Second document text.
        topK: Number of keywords requested per document.
        sim: Callable taking the two vectors and returning a score;
            defaults to ``pers_sim``.

    Returns:
        Whatever *sim* returns for the two vectors.
    """
    # Build the vectors in argument order: str_a first, then str_b.
    count_vectors = [countIDF(document,topK) for document in (str_a, str_b)]
    return sim(*count_vectors)

def _read_pdf_text(pdf):
    """Return the text of every page of *pdf*, each page followed by a space."""
    # Use the newer PyPDF2 names when the installed version provides them,
    # and fall back to the legacy names this file was written against.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(pdf)

    page_texts = []
    # Iterate over all pages; the previous range(number_of_pages-1) loop
    # silently skipped the last page of every document.
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        page_texts.append(extract() or "")

    # Keep the original layout: a leading space, then "<page> " per page.
    return " " + "".join(page_text + " " for page_text in page_texts)


def similarity(A,B):
    """Compare two PDF papers and return a formatted similarity string.

    The text of every page of each PDF is extracted and the two texts are
    scored with ``splitWord_PersionSimlaryty``; the score is shown as a
    percentage rounded to two decimals.

    Args:
        A: First PDF, passed straight to the PyPDF2 reader.
        B: Second PDF, passed straight to the PyPDF2 reader.

    Returns:
        A string of the form ``"論文相似度: <score>%"``.
    """
    str_L = _read_pdf_text(A)
    str_Y = _read_pdf_text(B)

    return "論文相似度: "+str(round(splitWord_PersionSimlaryty(str_L,str_Y)*100,2))+"%"

# Heading passed to the Gradio interface as its title.
title="Paper Similarity 論文相似度比較"
# Description passed to the Gradio interface: background text in English and
# Chinese, an embedded YouTube iframe, and upload instructions.
description='''
National Taiwan University on Tuesday (August 9) announced a decision to rescind a master's degree it gave to Lin Chih-chien (林智堅) in 2017, citing plagiarism after a meeting by the school's academic ethics committee.
"The act sullied the reputation of National Taiwan University...and the school will reinforce the importance of academic integrity and ethics, not letting it happen again."
With this in mind, we proposed machine learning method to analyze the similarity between 2 papers. Provide an objective indicator for your reference.

台大周二（8月9日）宣布撤銷2017年授予林智堅的碩士學位，理由是該校學術倫理委員會開會後認為存在抄襲。
“該行為玷污了台大的聲譽……學校將加強學術誠信和道德的重要性，不會讓這種事再次發生。”
考慮到這一點，我們提出了機器學習方法來分析兩篇論文之間的相似性。 提供一個客觀的指標供您參考。

<th>
<iframe width="560" height="315" src="https://www.youtube.com/embed/TQNzsQ6I69k" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</th>

### Please upload 2 papers below, the format is limited to pdf
### 以下請輸入2篇論文, 格式限定pdf

'''

# Build an interface that feeds two file inputs to `similarity` and shows its
# text result, then launch it immediately when this module is executed.
# NOTE: `demo` is bound to the return value of `.launch()`, not to the
# `gr.Interface` object itself.
demo = gr.Interface(similarity,["file", "file"],outputs='text',title=title,description=description).launch()