File size: 3,387 Bytes
24543e1
 
 
 
 
 
 
 
2f27b07
24543e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f6fbd9
 
6d77e76
6b5db2e
 
24543e1
 
 
 
00cdc62
24543e1
 
 
 
 
 
13708b7
50a86a7
22771ed
77a5389
50a86a7
24543e1
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import io
import tokenize

import gradio as gr
from transformers import (AutoModelForSeq2SeqLM, AutoModelWithLMHead,
                          AutoTokenizer, SummarizationPipeline)

# Build the summarization pipeline once at import time (CPU only: device=-1).
# CodeTrans T5 is an encoder-decoder model, so it is loaded with
# AutoModelForSeq2SeqLM; the original AutoModelWithLMHead has been deprecated
# in transformers in favor of the task-specific auto classes.
pipeline = SummarizationPipeline(
    model=AutoModelForSeq2SeqLM.from_pretrained(
        "SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune"),
    tokenizer=AutoTokenizer.from_pretrained(
        "SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune",
        skip_special_tokens=True),
    device=-1)
    
def code_summarizer(code):
  def code_tokenizer(line):
    result= []
    line = io.StringIO(line) 
    
    for toktype, tok, start, end, line in tokenize.generate_tokens(line.readline):
      if (not toktype == tokenize.COMMENT):
        if toktype == tokenize.STRING:
          result.append("CODE_STRING")
        elif toktype == tokenize.NUMBER:
          result.append("CODE_INTEGER")
        elif (not tok=="\n") and (not tok=="    "):
          result.append(str(tok))
    return ' '.join(result)
    
  tokenized_code = code_tokenizer(code)
  summary = pipeline(tokenized_code)
  return summary[0]['summary_text']

def call_examples():
  """Return the sample snippets shown in the Gradio examples panel.

  Each snippet is wrapped in a one-element list, which is the shape
  gr.Interface expects for a single-input component.
  """
  snippets = (
      '''def findAverage(list): sum = 0. for x in list: sum = sum + x average = sum / len(list) return average''',
      '''def findMax(list): max = list[0] for x in list: if x > max: max = x return max''',
      '''def findRange(list): return max(list)-min(list)''',
      '''def rng(): return random.randint(0,9)''',
      '''def search(arr, low, high, x): if high >= low: mid = (high + low) // 2 if arr[mid] == x: return mid elif arr[mid] > x: return search(arr, low, mid - 1, x) else: return search(arr, mid + 1, high, x) else: return -1''',
  )
  return [[snippet] for snippet in snippets]

# Wire up and launch the Gradio UI for the summarizer.
# NOTE(review): gr.inputs/gr.outputs are the gradio 2.x component namespaces;
# confirm the pinned gradio version before upgrading to gr.Textbox.
_ARTICLE_HTML = '''<h2><b>Additional Information</b></h2><p style='text-align: justify'>This CodeTrans model is based on the T5-large model architecture pretrained on the Python programming language. It is trained on tokenized python code functions, which is why it works best with tokenized python functions. It has its own SentencePiece vocabulary model. It used multi-task training on 13 supervised tasks in the software development domain and 7 unsupervised datasets. It is then fine-tuned on the source code summarization task for the python code snippets. Further information about the employed model (<b>SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune</b>) such as metrics, data set used, etc. are found in <b><a href='https://huggingface.co/SEBIS/code_trans_t5_large_source_code_summarization_python_multitask_finetune' target='_blank'>this link</a></b> and the original GitHub repository is found in <b><a href='https://github.com/agemagician/CodeTrans' target='_blank'>this link</a></b>.</p>'''

# Input: a multi-line textbox for the raw Python snippet.
_code_input = gr.inputs.Textbox(
    lines=10,
    default='',
    placeholder='Insert a Python code here',
    label='PYTHON CODE')

# Output: a plain textbox for the generated summary.
_summary_output = gr.outputs.Textbox(
    type='auto',
    label='CODE SUMMARY')

demo = gr.Interface(
    fn=code_summarizer,
    inputs=_code_input,
    outputs=_summary_output,
    title='Python Code Summarizer From CodeTrans',
    description='Summarize any Python source code',
    article=_ARTICLE_HTML,
    theme='dark-peach',
    layout='horizontal',
    examples=call_examples(),
    allow_flagging='never')
demo.launch(inbrowser=True)