Spaces:
Sleeping
Sleeping
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[ ]: | |
import PyPDF2 | |
import jieba | |
import jieba.posseg as pseg | |
from jieba import analyse | |
import gradio as gr | |
import numpy as np | |
import os | |
def countIDF(text, topK):
    """Return occurrence counts for the top keywords extracted from ``text``.

    The text is segmented with ``jieba.cut`` and every token is tallied.
    ``jieba.analyse.extract_tags`` is then asked for up to ``topK`` keywords,
    and the tally of each keyword is returned in the order the keywords were
    produced by the extractor.

    Args:
        text: Document text to analyse.
        topK: Maximum number of keywords requested from ``extract_tags``.

    Returns:
        A list of integer counts, one per extracted keyword.
    """
    extract_keywords = analyse.extract_tags

    # Tally how often each segmented token occurs in the text.
    occurrences = {}
    for token in jieba.cut(text):
        occurrences[token] = occurrences.get(token, 0) + 1

    # With withWeight=True each entry is a (keyword, weight) pair; only the
    # keyword itself is needed to look up its raw count.
    ranked = extract_keywords(text, topK, withWeight=True)
    return [occurrences[entry[0]] for entry in ranked]
def pers_sim(a, b):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Both inputs are mean-centred and the normalised dot product of the
    centred vectors is returned.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers, expected to have the same length as ``a``.

    Returns:
        The correlation as a float in [-1, 1]. When the correlation is
        undefined (an empty input, or either input has zero variance) 0.0 is
        returned instead of dividing by zero and producing ``nan``.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # An empty vector has no mean; avoid numpy's nan + RuntimeWarning.
    if a.size == 0 or b.size == 0:
        return 0.0

    a = a - np.average(a)
    b = b - np.average(b)

    # The denominator is zero when either vector is constant (zero variance);
    # the coefficient is undefined there, so report "no correlation".
    denominator = np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2))
    if denominator == 0:
        return 0.0

    return float(np.sum(a * b) / denominator)
def splitWord_PersionSimlaryty(str_a, str_b, topK=20, sim=pers_sim):
    """Score the similarity of two texts from their top-keyword counts.

    Each text is reduced by ``countIDF`` to a vector holding the occurrence
    counts of its top ``topK`` keywords, and the two vectors are compared
    with ``sim``.

    Args:
        str_a: First text.
        str_b: Second text.
        topK: Number of keywords requested per text.
        sim: Callable taking two equal-length numeric sequences and returning
            a similarity score; defaults to ``pers_sim``.

    Returns:
        The value returned by ``sim``, or 0.0 when either text yields no
        keywords at all (nothing to compare).
    """
    vec_a = list(countIDF(str_a, topK))
    vec_b = list(countIDF(str_b, topK))

    # A text with no extractable keywords gives an empty vector, for which
    # a correlation is meaningless.
    if not vec_a or not vec_b:
        return 0.0

    # A short text can yield fewer than topK keywords, leaving the vectors
    # with different lengths. Pad the shorter one with zero counts so the
    # element-wise comparison in ``sim`` is well defined.
    width = max(len(vec_a), len(vec_b))
    vec_a = vec_a + [0] * (width - len(vec_a))
    vec_b = vec_b + [0] * (width - len(vec_b))

    return sim(vec_a, vec_b)
def _read_pdf_text(source):
    """Return the text of every page of the PDF ``source``, space-separated."""
    # Newer PyPDF2 releases name the reader ``PdfReader`` and the page method
    # ``extract_text``; older ones only provide ``PdfFileReader`` and
    # ``extractText``. Prefer the new names and fall back to the legacy ones.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(source)

    # The result starts with a space and each page is followed by a space.
    chunks = [" "]
    # Iterate over ALL pages, including the last one.
    for index in range(len(reader.pages)):
        page = reader.pages[index]
        extract = getattr(page, "extract_text", None) or page.extractText
        chunks.append((extract() or "") + " ")
    return "".join(chunks)


def similarity(A, B):
    """Compare two PDF documents and report their similarity as a percentage.

    The text of both PDFs is extracted page by page, then scored with
    ``splitWord_PersionSimlaryty``. The score is multiplied by 100 and
    rounded to two decimals for display.

    Args:
        A: First PDF, in whatever form the PyPDF2 reader accepts
            (presumably the path or file object supplied by the Gradio
            "file" input -- confirm against the Gradio version in use).
        B: Second PDF, same form as ``A``.

    Returns:
        A display string of the form ``"論文相似度: <score>%"``.
    """
    text_a = _read_pdf_text(A)
    text_b = _read_pdf_text(B)
    score = splitWord_PersionSimlaryty(text_a, text_b)
    return "論文相似度: " + str(round(score * 100, 2)) + "%"
# Heading shown at the top of the Gradio page.
title = "Paper Similarity 論文相似度比較"

# Introductory text (English and Chinese) plus an embedded video, rendered
# below the title.
description = '''
National Taiwan University on Tuesday (August 9) announced a decision to rescind a master's degree it gave to Lin Chih-chien (林智堅) in 2017, citing plagiarism after a meeting by the school's academic ethics committee.
"The act sullied the reputation of National Taiwan University...and the school will reinforce the importance of academic integrity and ethics, not letting it happen again."
With this in mind, we proposed machine learning method to analyze the similarity between 2 papers. Provide an objective indicator for your reference.
台大周二(8月9日)宣布撤銷2017年授予林智堅的碩士學位,理由是該校學術倫理委員會開會後認為存在抄襲。
“該行為玷污了台大的聲譽……學校將加強學術誠信和道德的重要性,不會讓這種事再次發生。”
考慮到這一點,我們提出了機器學習方法來分析兩篇論文之間的相似性。 提供一個客觀的指標供您參考。
<th>
<iframe width="560" height="315" src="https://www.youtube.com/embed/TQNzsQ6I69k" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</th>
### Please upload 2 papers below, the format is limited to pdf
### 以下請輸入2篇論文, 格式限定pdf
'''

# Build the two-file-upload interface around ``similarity`` and start serving
# it immediately. Note that ``demo`` holds the value returned by ``launch()``,
# not the Interface object itself.
demo = gr.Interface(
    fn=similarity,
    inputs=["file", "file"],
    outputs='text',
    title=title,
    description=description,
).launch()