// Command-line entry point for a language-model filter. Per the usage text
// printed by DisplayHelp, the invocation is
//   mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file
// with mode being one of copy, single, multiple, union.
//
// NOTE(review): this copy of the file looks extraction-damaged. Line breaks are
// collapsed and angle-bracketed text appears to be missing: `template void ...`
// has no parameter list, `boost::lexical_cast(str + 8)` has no type argument,
// and five `#include` directives have no target. Restore those from the
// upstream file rather than from this copy -- TODO confirm against upstream.
//
// Contents, in order of appearance:
//   DisplayHelp(name)     Writes the usage/help text to std::cerr. Which
//                         threading paragraph is included depends on NTHREAD.
//   FilterMode, Format    Enums for the selected mode and the file format.
//   Config                Parsed options. The constructor sets phrase=false,
//                         context=false, format=FORMAT_ARPA and, unless NTHREAD
//                         is defined, batch_size=25000 and threads from
//                         boost::thread::hardware_concurrency(), replaced by 1
//                         when that call returns 0.
//                         NOTE(review): `mode` is not initialised by the
//                         constructor; main assigns MODE_UNSET right after
//                         constructing the Config.
//   RunThreadedFilter     Calls Format::RunFilter(in_lm, filter, output)
//                         directly when config.threads == 1 or when built with
//                         NTHREAD. Otherwise builds a Controller from
//                         (batch_size, threads * 2, threads, filter, output)
//                         and passes that to Format::RunFilter in place of the
//                         filter.
//   RunContextFilter      Takes the filter by value. When config.context is
//                         set, wraps it in a ContextFilter before calling
//                         RunThreadedFilter; otherwise passes it through.
//   DispatchBinaryFilter  Wraps `binary` in a BinaryFilter and forwards to
//                         RunContextFilter.
//   DispatchFilterModes   MODE_MULTIPLE: builds a Format::Multiple output from
//                         out_name and the value returned by
//                         phrase::ReadMultiple or vocab::ReadMultiple, then
//                         filters with phrase::Multiple or vocab::Multiple.
//                         All other modes first construct
//                         Format::Output out(out_name), then: MODE_COPY calls
//                         Format::Copy; MODE_SINGLE reads words with
//                         vocab::ReadSingle and filters with vocab::Single;
//                         MODE_UNION filters with phrase::Union or
//                         vocab::Union depending on config.phrase.
//                         NOTE(review): a mode matching none of these
//                         (MODE_UNSET) constructs the output and returns
//                         without filtering; main rejects MODE_UNSET before
//                         calling this function.
//   main                  Parses arguments, opens the inputs and dispatches on
//                         config.format. Returns 0 on success and 1 on any
//                         usage error or caught std::exception.
//
// NOTE(review): the help text spells "concurrency" as "conccurrency".
// NOTE(review): the input-prefix checks in main call unqualified strncmp and
// strchr, while the option loop uses std::strcmp and std::strncmp.
#include "arpa_io.hh" #include "format.hh" #include "phrase.hh" #ifndef NTHREAD #include "thread.hh" #endif #include "vocab.hh" #include "wrapper.hh" #include "../../util/exception.hh" #include "../../util/file_piece.hh" #include #include #include #include #include namespace lm { namespace { void DisplayHelp(const char *name) { std::cerr << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" " parser.\n" "single mode treats the entire input as a single sentence.\n" "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" " a separate line. A separate file is created for each sentence by appending\n" " the 0-indexed line number to the output file name.\n" "union mode produces one filtered model that is the union of models created by\n" " multiple mode.\n\n" "context means only the context (all but last word) has to pass the filter, but\n" " the entire n-gram is output.\n\n" "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" " phrases can generate the n-gram when assembled in arbitrary order and\n" " clipped. Currently works with multiple or union mode.\n\n" "The file format is set by [raw|arpa] with default arpa:\n" "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" " text. This is useful for ngram count files.\n" "arpa means the ARPA file format for n-gram language models.\n\n" #ifndef NTHREAD "threads:m sets m threads (default: conccurrency detected by boost)\n" "batch_size:m sets the batch size for threading. Expect memory usage from this\n" " of 2*threads*batch_size n-grams.\n\n" #else "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" " threading, compile without this flag against Boost >=1.42.0.\n\n" #endif "There are two inputs: vocabulary and model. 
Either may be given as a file\n" " while the other is on stdin. Specify the type given as a file using\n" " vocab: or model: before the file name. \n\n" "For ARPA format, the output must be seekable. For raw format, it can be a\n" " stream i.e. /dev/stdout\n"; } typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; struct Config { Config() : #ifndef NTHREAD batch_size(25000), threads(boost::thread::hardware_concurrency()), #endif phrase(false), context(false), format(FORMAT_ARPA) { #ifndef NTHREAD if (!threads) threads = 1; #endif } #ifndef NTHREAD size_t batch_size; size_t threads; #endif bool phrase; bool context; FilterMode mode; Format format; }; template void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { #ifndef NTHREAD if (config.threads == 1) { #endif Format::RunFilter(in_lm, filter, output); #ifndef NTHREAD } else { typedef Controller Threaded; Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); Format::RunFilter(in_lm, threading, output); } #endif } template void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { if (config.context) { ContextFilter context_filter(filter); RunThreadedFilter, OutputBuffer, Output>(config, in_lm, context_filter, output); } else { RunThreadedFilter(config, in_lm, filter, output); } } template void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { typedef BinaryFilter Filter; RunContextFilter(config, in_lm, Filter(binary), out); } template void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char *out_name) { if (config.mode == MODE_MULTIPLE) { if (config.phrase) { typedef phrase::Multiple Filter; phrase::Substrings substrings; typename Format::Multiple out(out_name, 
phrase::ReadMultiple(in_vocab, substrings)); RunContextFilter(config, in_lm, Filter(substrings), out); } else { typedef vocab::Multiple Filter; boost::unordered_map > words; typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); RunContextFilter(config, in_lm, Filter(words), out); } return; } typename Format::Output out(out_name); if (config.mode == MODE_COPY) { Format::Copy(in_lm, out); return; } if (config.mode == MODE_SINGLE) { vocab::Single::Words words; vocab::ReadSingle(in_vocab, words); DispatchBinaryFilter(config, in_lm, vocab::Single(words), out); return; } if (config.mode == MODE_UNION) { if (config.phrase) { phrase::Substrings substrings; phrase::ReadMultiple(in_vocab, substrings); DispatchBinaryFilter(config, in_lm, phrase::Union(substrings), out); } else { vocab::Union::Words words; vocab::ReadMultiple(in_vocab, words); DispatchBinaryFilter(config, in_lm, vocab::Union(words), out); } return; } } } // namespace } // namespace lm int main(int argc, char *argv[]) { try { if (argc < 4) { lm::DisplayHelp(argv[0]); return 1; } // I used to have boost::program_options, but some users didn't want to compile boost. 
// Argument handling in main, as written below:
//   - Fewer than 4 argv entries prints the help text and returns 1.
//   - argv[1] .. argv[argc - 3] are option keywords. An unrecognised keyword
//     prints the help text and returns 1.
//   - "threads:0" is rejected. A batch_size below 5000 prints a warning and is
//     fatal only when it is 0.
//   - A mode keyword is mandatory. "phrase" is accepted only together with
//     multiple or union mode.
//   - argv[argc - 2] is the input file, optionally prefixed with "vocab:" or
//     "model:". Any other ':' in it is an error; with no prefix the file is
//     assumed to be a model and a notice is printed.
//   - When the file is the model, the vocabulary is read from std::cin. When
//     the file is the vocabulary, the model FilePiece is constructed on file
//     descriptor 0 with a NULL name.
//   - argv[argc - 1] is passed to DispatchFilterModes as the output name.
lm::Config config; config.mode = lm::MODE_UNSET; for (int i = 1; i < argc - 2; ++i) { const char *str = argv[i]; if (!std::strcmp(str, "copy")) { config.mode = lm::MODE_COPY; } else if (!std::strcmp(str, "single")) { config.mode = lm::MODE_SINGLE; } else if (!std::strcmp(str, "multiple")) { config.mode = lm::MODE_MULTIPLE; } else if (!std::strcmp(str, "union")) { config.mode = lm::MODE_UNION; } else if (!std::strcmp(str, "phrase")) { config.phrase = true; } else if (!std::strcmp(str, "context")) { config.context = true; } else if (!std::strcmp(str, "arpa")) { config.format = lm::FORMAT_ARPA; } else if (!std::strcmp(str, "raw")) { config.format = lm::FORMAT_COUNT; #ifndef NTHREAD } else if (!std::strncmp(str, "threads:", 8)) { config.threads = boost::lexical_cast(str + 8); if (!config.threads) { std::cerr << "Specify at least one thread." << std::endl; return 1; } } else if (!std::strncmp(str, "batch_size:", 11)) { config.batch_size = boost::lexical_cast(str + 11); if (config.batch_size < 5000) { std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; if (!config.batch_size) return 1; } #endif } else { lm::DisplayHelp(argv[0]); return 1; } } if (config.mode == lm::MODE_UNSET) { lm::DisplayHelp(argv[0]); return 1; } if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." 
<< std::endl; return 1; } bool cmd_is_model = true; const char *cmd_input = argv[argc - 2]; if (!strncmp(cmd_input, "vocab:", 6)) { cmd_is_model = false; cmd_input += 6; } else if (!strncmp(cmd_input, "model:", 6)) { cmd_input += 6; } else if (strchr(cmd_input, ':')) { std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; return 1; } else { std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; } std::ifstream cmd_file; std::istream *vocab; if (cmd_is_model) { vocab = &std::cin; } else { cmd_file.open(cmd_input, std::ios::in); UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input); vocab = &cmd_file; } util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); if (config.format == lm::FORMAT_ARPA) { lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); } else if (config.format == lm::FORMAT_COUNT) { lm::DispatchFilterModes(config, *vocab, model, argv[argc - 1]); } return 0; } catch (const std::exception &e) { std::cerr << e.what() << std::endl; return 1; } }