Spaces:

abhaskumarsinha
/

MinimalGPT-Felis_Catus

Sleeping

App Files Files Community

MinimalGPT-Felis_Catus / subword /get_vocab.py

abhaskumarsinha

Upload 21 files

70d3cae over 1 year ago

raw

history blame contribute delete

No virus

2.76 kB

	#! /usr/bin/env python
	from __future__ import print_function

	import os
	import sys
	import inspect
	import warnings
	import argparse
	import codecs

	from collections import Counter

	# hack for python2/3 compatibility
	from io import open
	argparse.open = open

	def create_parser(subparsers=None):
	    """Build the command-line parser for vocabulary extraction.

	    If `subparsers` is truthy, the parser is registered on it as the
	    'get-vocab' sub-command; otherwise a standalone ArgumentParser is
	    created. In both cases the parser accepts --input/-i (an
	    argparse.FileType('r'), defaulting to sys.stdin) and --output/-o (an
	    argparse.FileType('w'), defaulting to sys.stdout).

	    Returns the configured parser.
	    """
	    shared = {
	        'formatter_class': argparse.RawDescriptionHelpFormatter,
	        'description': "Generates vocabulary",
	    }
	    if not subparsers:
	        parser = argparse.ArgumentParser(**shared)
	    else:
	        parser = subparsers.add_parser('get-vocab', **shared)

	    # (flags, file mode, default stream, help text); registered in this
	    # order so the generated help lists --input before --output.
	    stream_options = (
	        (('--input', '-i'), 'r', sys.stdin,
	         "Input file (default: standard input)."),
	        (('--output', '-o'), 'w', sys.stdout,
	         "Output file (default: standard output)"),
	    )
	    for flags, mode, fallback, help_text in stream_options:
	        parser.add_argument(
	            *flags, type=argparse.FileType(mode), default=fallback,
	            metavar='PATH', help=help_text)

	    return parser

	def get_vocab(train_file, vocab_file):
	    """Count space-separated tokens and write them by descending frequency.

	    Every line yielded by `train_file` has carriage returns, newlines and
	    spaces stripped from both ends and is then split on single spaces;
	    empty pieces (from repeated spaces) are ignored. For each distinct
	    token one "<token> <count>" line is written to `vocab_file`, most
	    frequent first. Tokens with equal counts keep the counter's own
	    iteration order because the sort is stable.
	    """
	    counts = Counter()
	    for line in train_file:
	        tokens = line.strip('\r\n ').split(' ')
	        counts.update(token for token in tokens if token)

	    # most_common() with no argument sorts all items by count, descending.
	    for token, freq in counts.most_common():
	        vocab_file.write('%s %d\n' % (token, freq))

	if __name__ == "__main__":

	    # Emit a deprecation warning when a 'subword_nmt' directory exists next
	    # to this script, i.e. the script is being run from its old location.
	    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
	    newdir = os.path.join(currentdir, 'subword_nmt')
	    if os.path.isdir(newdir):
	        warnings.warn(
	            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
	            DeprecationWarning
	        )

	    # python 2/3 compatibility: force UTF-8 on the standard streams
	    if sys.version_info < (3, 0):
	        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
	        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
	        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
	    else:
	        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
	        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
	        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

	    parser = create_parser()
	    args = parser.parse_args()

	    # Decide once which streams are real files that this script owns and
	    # must close; the standard streams are left open.
	    input_is_file = args.input.name != '<stdin>'
	    output_is_file = args.output.name != '<stdout>'

	    try:
	        # read/write files as UTF-8. argparse.FileType has already opened
	        # the named files with the default encoding, so close those handles
	        # before re-opening; otherwise they would leak.
	        if input_is_file:
	            input_path = args.input.name
	            args.input.close()
	            args.input = codecs.open(input_path, encoding='utf-8')
	        if output_is_file:
	            output_path = args.output.name
	            args.output.close()
	            args.output = codecs.open(output_path, 'w', encoding='utf-8')

	        get_vocab(args.input, args.output)
	    finally:
	        # close files even if reading or writing failed part-way
	        if input_is_file:
	            args.input.close()
	        if output_is_file:
	            args.output.close()