maxmon committed on
Commit
1086ffd
·
1 Parent(s): 1743235

feat: init

Browse files
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+
4
def auto_anno(txt, types, radio, need_trans=False):
    """Annotate ``txt`` with the algorithm selected in the UI.

    Args:
        txt: Sentence to annotate.
        types: Candidate categories / entity types; forwarded unchanged to the
            selected annotator.
        radio: Algorithm name. '文本分类' dispatches to ``text_classification``
            and '实体抽取' dispatches to ``extract_named_entities``.
        need_trans: When true, ``txt`` is first passed through ``en2cn`` and
            the translated text is prepended to the result on its own line.

    Returns:
        The annotator's result, or the translated text followed by a newline
        and the result when ``need_trans`` is set.

    Raises:
        ValueError: If ``radio`` is not one of the two supported names (for
            example when no algorithm has been selected).
    """
    # Validate the selection first: previously an unknown value fell through
    # both branches and crashed with UnboundLocalError on ``return result``,
    # after possibly spending a translation request for nothing.
    if radio == '文本分类':
        annotate = text_classification
    elif radio == '实体抽取':
        annotate = extract_named_entities
    else:
        raise ValueError(f'unsupported algorithm type: {radio!r}')

    if need_trans:
        txt = en2cn(txt)
    result = annotate(txt, types)
    if need_trans:
        result = f'{txt}\n{result}'
    return result
14
+
15
# Gradio components used by the interface, created in the same order as before:
# two three-line text inputs, the result box, the algorithm selector and the
# translation toggle.
input1 = gr.Textbox(label="输入原句", lines=3)
input2 = gr.Textbox(label="输入类别", lines=3)
output = gr.Textbox(label="输出结果")
radio = gr.Radio(
    ["文本分类", "实体抽取"],
    label="算法类型",
)
checkbox = gr.Checkbox(label="翻译成中文")
20
+
21
+ # 读取数据
22
+ from utils.anno.cls.text_classification import text_classification
23
+ from utils.anno.ner.entity_extract import extract_named_entities
24
+ from utils.api.google_trans import en2cn
25
+
26
if __name__ == '__main__':
    # Expose auto_anno through a Gradio Interface built from the module-level
    # components, then launch it with share=True.
    demo = gr.Interface(
        fn=auto_anno,
        inputs=[input1, input2, radio, checkbox],
        outputs=[output],
    )
    demo.launch(share=True)
local_config.py ADDED
@@ -0,0 +1 @@
 
 
1
import os

# SECURITY: never commit API credentials. The key that was hard-coded on this
# line has been published with the commit and must be revoked and rotated.
# The key is now read from the OPENAI_API_KEY environment variable; it is an
# empty string when the variable is not set.
openai_key = os.environ.get('OPENAI_API_KEY', '')
utils/anno/cls/__pycache__/text_classification.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
utils/anno/cls/text_classification.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import sys
3
+ sys.path.append('.')
4
+ from local_config import openai_key
5
+
6
+ # Set up your API key
7
+ openai.api_key = openai_key
8
+
9
def text_classification(src_txt, type_arr):
    """Ask gpt-3.5-turbo to classify ``src_txt`` using a one-shot chat prompt.

    Args:
        src_txt: Text to classify; it is embedded between ``` fences in the
            final user message.
        type_arr: Candidate categories; interpolated into the system prompt.

    Returns:
        The message content of the first completion choice, returned as-is
        (no parsing is performed).
    """
    system_prompt = f"你是一个聪明而且有百年经验的文本. 你的任务是从一段文本里面提取出相应的分类结果签。你的回答必须用统一的格式。文本用```符号分割。分类类型保存在一个数组里{type_arr}"
    # One worked example (user question + assistant answer) precedes the real query.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "输入|```这个商品真垃圾```输出|"},
        {"role": "assistant", "content": "差评"},
        {"role": "user", "content": f"输入|```{src_txt}```输出|"},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return completion.choices[0].message.content
28
+
29
if __name__ == '__main__':
    # Manual smoke test: classify three sample reviews against two labels and
    # print each review next to the model's answer.
    labels = ['好评', '差评']
    for review in ('这个商品真不错', '用着不行', '没用过这么好的东西'):
        print(review, text_classification(review, labels))
utils/anno/ner/__pycache__/entity_extract.cpython-310.pyc ADDED
Binary file (1.47 kB). View file
 
utils/anno/ner/entity_extract.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import json
3
+ import sys
4
+ sys.path.append('.')
5
+ from local_config import openai_key
6
+
7
+ # Set up your API key
8
+ openai.api_key = openai_key
9
+
10
def extract_named_entities(src_txt, type_arr):
    """Ask gpt-3.5-turbo to extract named entities from ``src_txt``.

    Args:
        src_txt: Text to analyse; it is embedded between ``` fences in the
            final user message.
        type_arr: Entity types of interest; interpolated into the system prompt.

    Returns:
        The model reply decoded with ``json.loads``. The one-shot example asks
        for a list of objects with ``name``, ``type``, ``start`` and ``end``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    system = f"你是一个聪明而且有百年经验的命名实体识别(NER)识别器. 你的任务是从一段文本里面提取出相应的实体并且给出标签。你的回答必须用统一的格式。文本用```符号分割。输出采用Json的格式并且标记实体在文本中的位置。实体类型保存在一个数组里{type_arr}"
    example_question = "输入|```皮卡丘神奇宝贝```输出|"
    # The example offsets must match the example text exactly, because the
    # model imitates them. "皮卡丘神奇宝贝" is 7 characters, so the second
    # entity spans [3, 7) -- the previous 4..8 was off by one.
    example_answer = """[{"name": "皮卡丘", "type": "Person", "start": 0, "end": 3}, {"name": "神奇宝贝", "type": "物种", "start": 3, "end": 7}]"""
    question = f"输入|```{src_txt}```输出|"

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": example_question},
            {"role": "assistant", "content": example_answer},
            {"role": "user", "content": question},
        ]
    )

    content = completion.choices[0].message.content
    # Kept on purpose: the module's __main__ demo relies on this print to show
    # the raw reply.
    print(content)
    return json.loads(content)
31
+
32
if __name__ == '__main__':
    # Manual check on one sentence; extract_named_entities prints the raw model
    # reply itself, so the return value is not used here.
    sample_text = '老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑'
    sample_types = ['代称', '行政区']
    extract_named_entities(sample_text, sample_types)
35
+ # Tags: PER(人名), LOC(地点名), GPE(行政区名), ORG(机构名)
36
+ # Label Tag Meaning
37
+ # PER PER.NAM 名字(张三)
38
+ # PER.NOM 代称、类别名(穷人)
39
+ # LOC LOC.NAM 特指名称(紫玉山庄)
40
+ # LOC.NOM 泛称(大峡谷、宾馆)
41
+ # GPE GPE.NAM 行政区的名称(北京)
42
+ # ORG ORG.NAM 特定机构名称(通惠医院)
43
+ # ORG.NOM 泛指名称、统称(文艺公司)
44
+ # 原始标注 老百姓PER.NOM 新乡GPE.NAM
45
+ # gpt-3.5-turbo [{"name": "老百姓", "type": "代称", "start": 0, "end": 4}, {"name": "新乡新闻网", "type": "组织机构", "start": 4, "end": 10}, {"name": "新乡", "type": "行政区", "start": 12, "end": 14}, {"name": "天气", "type": "自然现象", "start": 14, "end": 16}]
46
+ # ERNIE-UIE {"text":"老百姓心新乡新闻网话说这几天新乡天气还好吧偷笑","result":[{"行政区":[{"text":"新乡","start":4,"end":6,"probability":0.589552328738506}]}]}
47
+
utils/api/__pycache__/google_trans.cpython-310.pyc ADDED
Binary file (889 Bytes). View file
 
utils/api/chatglm.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
# Page address: https://fd7fa865d3f27cda69.gradio.live/
# Request payload.
data = {'prompt': '清华大学地址'}
# Send the POST request to the API. An explicit timeout is passed because
# requests otherwise waits indefinitely for an unresponsive server.
response = requests.post('http://region-9.seetacloud.com:51661/', json=data, timeout=60)
# Decode the JSON reply.
result = response.json()
print(result)
utils/api/google_trans.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+
4
def en2cn(text):
    """Translate ``text`` with ``trans`` using source 'en' and target 'zh-CN'."""
    source_lang, target_lang = 'en', 'zh-CN'
    return trans(text, source_lang, target_lang)
6
+
7
def trans(text, sl, tl):
    """Translate ``text`` through the translate.googleapis.com ``single`` endpoint.

    Args:
        text: Text to translate.
        sl: Source language code (for example 'en').
        tl: Target language code (for example 'zh-CN').

    Returns:
        The concatenation of the first item of every segment found in the
        first element of the JSON reply.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # Let requests build the query string: interpolating ``text`` into the URL
    # by hand left '&', '#', '+', spaces and newlines unescaped, which
    # truncated or corrupted the query.
    response = requests.get(
        'https://translate.googleapis.com/translate_a/single',
        params={'client': 'gtx', 'sl': sl, 'tl': tl, 'dt': 't', 'q': text},
        timeout=10,  # without a timeout, requests can block indefinitely
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()
    payload = json.loads(response.content)
    # Skip segments with no text; str.join would raise on a non-string item.
    return ''.join(segment[0] for segment in payload[0] if segment[0])
14
+
15
if __name__ == '__main__':
    # Manual smoke test: translate a fixed phrase and print the result.
    translated = en2cn('hello world')
    print(translated)
utils/auto_learn/cluster_text.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ from sklearn.decomposition import PCA
3
+ from sklearn.cluster import KMeans
4
+ from sklearn.metrics.pairwise import euclidean_distances
5
+ import openai
6
+ import numpy as np
7
+ # import matplotlib
8
+ # print(matplotlib.matplotlib_fname())
9
+ import sys
10
+ sys.path.append('.')
11
+ from local_config import openai_key
12
+
13
+
14
def cluster_text(text_list, n_clusters=20, openai_api_key=openai_key):
    """Pick one outlying sample per K-Means cluster centre.

    Each text is embedded with the OpenAI "text-embedding-ada-002" model, the
    embeddings are clustered with K-Means, and for every cluster centre the
    text lying farthest from that centre (measured over all texts) is chosen.
    If that text was already chosen, the next farthest unseen text is used.

    Args:
        text_list: Texts to cluster.
        n_clusters: Requested number of clusters; capped at ``len(text_list)``
            because K-Means cannot fit more clusters than samples.
        openai_api_key: API key assigned to ``openai.api_key``.

    Returns:
        A list of distinct texts, at most one per cluster.
    """
    if not text_list:
        return []

    openai.api_key = openai_api_key
    model = "text-embedding-ada-002"
    # One embedding request per text, in input order.
    data = np.array([
        openai.Embedding.create(input=[text], model=model).data[0].embedding
        for text in text_list
    ])

    n_clusters = min(n_clusters, len(text_list))
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(data)

    # distances[s, c] is the distance from sample s to cluster centre c.
    distances = euclidean_distances(data, kmeans.cluster_centers_)
    farthest = np.argmax(distances, axis=0)

    samples = []
    seen_samples = set()
    for cluster_idx, sample_idx in enumerate(farthest):
        sample = text_list[sample_idx]
        if sample in seen_samples:
            # Fall back to the next farthest unseen text for THIS centre. The
            # column must be the cluster index; the old code indexed it with
            # the sample index, reading the wrong column or raising IndexError.
            for candidate_idx in np.argsort(distances[:, cluster_idx])[::-1]:
                candidate = text_list[candidate_idx]
                if candidate not in seen_samples:
                    sample = candidate
                    break
            else:
                continue  # every text has already been selected
        samples.append(sample)
        seen_samples.add(sample)

    return samples
59
+
60
+
61
def plot_clusters(text_list, n_clusters=20, openai_api_key=openai_key):
    """Embed, cluster and scatter-plot ``text_list`` in two dimensions.

    Each text is embedded with the OpenAI "text-embedding-ada-002" model, the
    embeddings are clustered with K-Means and projected to 2-D with PCA. The
    points are coloured by cluster label and annotated with their text, then
    shown with ``plt.show()``.

    Args:
        text_list: Texts to plot. Nothing is plotted when it is empty.
        n_clusters: Requested number of clusters; capped at ``len(text_list)``
            because K-Means cannot fit more clusters than samples.
        openai_api_key: API key assigned to ``openai.api_key``.
    """
    if not text_list:
        return

    openai.api_key = openai_api_key
    model = "text-embedding-ada-002"
    # One embedding request per text, in input order.
    data = np.array([
        openai.Embedding.create(input=[text], model=model).data[0].embedding
        for text in text_list
    ])

    # The default of 20 clusters used to fail on inputs with fewer texts.
    n_clusters = min(n_clusters, len(text_list))
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans.fit(data)

    # Project the embeddings onto their first two principal components.
    reduced_data = PCA(n_components=2).fit_transform(data)

    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=kmeans.labels_)
    for (x, y), text in zip(reduced_data, text_list):
        plt.annotate(text, (x, y))
    plt.show()
86
+
87
+
88
if __name__ == "__main__":
    # Demo on five sample reviews: select samples with three clusters, show
    # the cluster plot, then print the selected samples.
    demo_clusters = 3
    test_data = [
        '一百多和三十的也看不出什么区别,包装精美,质量应该不错。',
        '质量很好 料子很不错 做工细致 样式好看 穿着很漂亮',
        ' 会卷的 建议买大的小的会卷 胖就别买了 没用',
        '大差了 布料很差 我也不想多说',
        '一点也不好,我买的东西拿都拿到快递员自己签收了还不给我,恶心恶心恶心,不要脸不要脸',
    ]

    result = cluster_text(test_data, n_clusters=demo_clusters)
    plot_clusters(test_data, n_clusters=demo_clusters)

    print(result)
utils/format/bio_2_json.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def bio_2_json_one(anno_txt):
    """Convert one BIO-tagged sentence into text plus entity spans.

    Args:
        anno_txt: Lines of the form ``<token>\\t<label>`` separated by newlines.
            Labels are ``O``, ``B-<type>`` or ``I-<type>``. Blank lines are
            ignored.

    Returns:
        ``{'text': <joined tokens>, 'anno': [[start, end, surface, type], ...]}``
        where ``start``/``end`` are character offsets into ``text`` (end
        exclusive) and ``type`` is the part of the ``B-`` label between the
        first and second hyphen.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    text = ''
    anno = []
    start = 0
    now_label = ''
    for line in anno_txt.split('\n'):
        # A trailing newline or stray blank line used to crash the unpacking.
        if not line.strip():
            continue
        char, label = line.split('\t')
        label = label.strip()  # tolerate '\r' left over from CRLF files
        # Offsets come from the text built so far rather than the row number,
        # so skipped lines and multi-character tokens stay aligned.
        position = len(text)
        begins_entity = label.startswith('B-')
        # Close the open entity on 'O' and also on a new 'B-' tag; previously
        # an entity directly followed by another one was silently dropped.
        if now_label and (begins_entity or label == 'O'):
            anno.append([start, position, text[start:position], now_label])
            now_label = ''
        if begins_entity:
            start = position
            now_label = label.split('-')[1]
        text += char
    # Flush an entity that runs to the end of the sentence.
    if now_label:
        anno.append([start, len(text), text[start:], now_label])
    return {'text': text, 'anno': anno}
21
+
22
+
23
def bit_2_json(txt):
    """Convert a BIO corpus into a list of per-sentence annotation dicts.

    ``txt`` is split on blank lines ('\\n\\n'); every non-empty chunk is
    converted with ``bio_2_json_one`` and the results are returned in order.
    """
    return [
        bio_2_json_one(sentence_block)
        for sentence_block in txt.split('\n\n')
        if sentence_block != ''
    ]
32
+
33
+
34
if __name__ == '__main__':
    # Demo sentence as "<char>\t<label>" rows, joined with newlines.
    sample_rows = [
        '你\tB-PER',
        '是\tO',
        '一\tO',
        '个\tO',
        '聪\tB-PER',
        '明\tI-PER',
        '的\tO',
        '软\tB-ORG',
        '件\tI-ORG',
        '工\tI-ORG',
        '程\tI-ORG',
        '师\tI-ORG',
    ]
    txt = '\n'.join(sample_rows)
    # Alternative input: read a corpus file such as data/ner/weibo_ner/dev.txt.
    annos = bit_2_json(txt)
    print(annos)