qingxu99 committed on
Commit
8d52819
1 Parent(s): 3951159
Files changed (1) hide show
  1. multi_language.py +17 -13
multi_language.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import functools
3
- import os
4
  import pickle
5
  import time
6
 
@@ -79,22 +79,26 @@ def lru_file_cache(maxsize=128, ttl=None, filename=None):
79
  return decorator_function
80
 
81
 
 
 
 
 
 
 
82
 
83
def extract_chinese_characters(file_path):
    """
    Collect every run of consecutive Chinese characters found in a file.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FFF (inclusive).

    Args:
        file_path: Path of a UTF-8 encoded text file to scan.

    Returns:
        A list of dicts, one per run, in order of appearance. Each dict has
        the keys 'file' (the given path), 'begin' and 'end' (inclusive
        character offsets of the run within the file content) and 'word'
        (the text of the run).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    chinese_characters = []
    sentence = {'file': file_path, 'begin': -1, 'end': -1, 'word': ""}
    for index, char in enumerate(content):
        if 0x4e00 <= ord(char) <= 0x9fff:
            sentence['word'] += char
            if sentence['begin'] == -1:
                sentence['begin'] = index
            sentence['end'] = index
        elif sentence['word']:
            # A non-Chinese character terminates the run collected so far.
            chinese_characters.append(sentence)
            sentence = {'file': file_path, 'begin': -1, 'end': -1, 'word': ""}

    # A run that reaches the end of the file has no terminating character,
    # so it must be flushed here or it would be lost.
    if sentence['word']:
        chinese_characters.append(sentence)
    return chinese_characters
98
 
99
  def extract_chinese_characters_from_directory(directory_path):
100
  chinese_characters = []
 
1
  import os
2
  import functools
3
+ import re
4
  import pickle
5
  import time
6
 
 
79
  return decorator_function
80
 
81
 
82
def contains_chinese(string):
    """
    Tell whether ``string`` holds at least one Chinese character.

    A character counts as Chinese when it falls in the range
    U+4E00..U+9FFF, which is what the regular expression below matches.

    Returns:
        True if such a character is found anywhere in ``string``,
        False otherwise.
    """
    match = re.search(u'[\u4e00-\u9fff]+', string)
    return bool(match)
88
 
89
def extract_chinese_characters(file_path):
    """
    Find the names in a Python source file that contain Chinese characters.

    The file is read as UTF-8 and parsed with ``ast.parse``. Every
    ``ast.Name`` node in the resulting tree whose ``id`` passes
    ``contains_chinese`` is printed and collected.

    Args:
        file_path: Path of the Python source file to inspect.

    Returns:
        A list of the matching ``ast.Name`` nodes, in ``ast.walk`` order.
    """
    import ast

    with open(file_path, 'r', encoding='utf-8') as source_file:
        tree = ast.parse(source_file.read())

    matches = []
    for candidate in ast.walk(tree):
        # Only plain name nodes are inspected; all other node kinds are skipped.
        if not isinstance(candidate, ast.Name):
            continue
        if not contains_chinese(candidate.id):
            continue
        print(candidate.id)
        matches.append(candidate)

    return matches
 
 
 
102
 
103
  def extract_chinese_characters_from_directory(directory_path):
104
  chinese_characters = []