#!/usr/bin/env python # coding: utf-8 # In[ ]: import PyPDF2 import jieba import jieba.posseg as pseg from jieba import analyse import gradio as gr import numpy as np import os def countIDF(text,topK): tfidf = analyse.extract_tags cipin = {} fenci = jieba.cut(text) for word in fenci: if word not in cipin.keys(): cipin[word] = 0 cipin[word] += 1 keywords = tfidf(text,topK,withWeight=True) ans = [] for keyword in keywords: ans.append(cipin[keyword[0]]) return ans def pers_sim(a,b): a = np.array(a) b = np.array(b) a = a - np.average(a) b = b - np.average(b) return np.sum(a*b) / (np.sqrt(np.sum(a**2))*np.sqrt(np.sum(b**2))) def splitWord_PersionSimlaryty(str_a,str_b,topK=20,sim=pers_sim): vec_a = countIDF(str_a,topK) vec_b = countIDF(str_b,topK) return sim(vec_a,vec_b) def similarity(A,B): text=[] read_pdf = PyPDF2.PdfFileReader(A) number_of_pages = read_pdf.getNumPages() for i in range(number_of_pages-1): page = read_pdf.pages[i] page_content = page.extractText() text.append(page_content) str_L=' ' for t in text: str_L+=t+' ' text=[] read_pdf = PyPDF2.PdfFileReader(B) number_of_pages = read_pdf.getNumPages() for i in range(number_of_pages-1): page = read_pdf.pages[i] page_content = page.extractText() text.append(page_content) str_Y=' ' for t in text: str_Y+=t+' ' return "論文相似度: "+str(round(splitWord_PersionSimlaryty(str_L,str_Y)*100,2))+"%" title="Paper Similarity 論文相似度比較" description=''' National Taiwan University on Tuesday (August 9) announced a decision to rescind a master's degree it gave to Lin Chih-chien (林智堅) in 2017, citing plagiarism after a meeting by the school's academic ethics committee. "The act sullied the reputation of National Taiwan University...and the school will reinforce the importance of academic integrity and ethics, not letting it happen again." With this in mind, we proposed machine learning method to analyze the similarity between 2 papers. Provide an objective indicator for your reference. 台大周二(8月9日)宣布撤銷2017年授予林智堅的碩士學位,理由是該校學術倫理委員會開會後認為存在抄襲。 “該行為玷污了台大的聲譽……學校將加強學術誠信和道德的重要性,不會讓這種事再次發生。” 考慮到這一點,我們提出了機器學習方法來分析兩篇論文之間的相似性。 提供一個客觀的指標供您參考。