|
#include "TargetCorpus.h" |
|
|
|
#include <fstream> |
|
#include <string> |
|
#include <cstdlib> |
|
#include <cstring> |
|
|
|
namespace
{

// Capacity, in chars, of the fixed buffer that Create() reads each corpus
// line into; it is also the size handed to SAFE_GETLINE.
const int LINE_MAX_LENGTH = 10000;

} // namespace

using namespace std;
|
|
|
// Constructs an empty corpus: both arrays are null pointers, the vocabulary
// is value-initialized, and the word and sentence counters are zero.
TargetCorpus::TargetCorpus()
  : m_array(NULL), m_sentenceEnd(NULL), m_vcb(), m_size(0), m_sentenceCount(0)
{
}
|
|
|
// Releases the two calloc()-allocated arrays. free(NULL) is a no-op, so this
// is also safe for a corpus that was never created or loaded.
TargetCorpus::~TargetCorpus()
{
  free(m_sentenceEnd);
  free(m_array);
}
|
|
|
void TargetCorpus::Create(const string& fileName ) |
|
{ |
|
ifstream textFile; |
|
char line[LINE_MAX_LENGTH]; |
|
|
|
|
|
textFile.open(fileName.c_str()); |
|
|
|
if (!textFile) { |
|
cerr << "no such file or directory " << fileName << endl; |
|
exit(1); |
|
} |
|
|
|
istream *fileP = &textFile; |
|
m_size = 0; |
|
m_sentenceCount = 0; |
|
while(!fileP->eof()) { |
|
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); |
|
if (fileP->eof()) break; |
|
vector< WORD_ID > words = m_vcb.Tokenize( line ); |
|
m_size += words.size(); |
|
m_sentenceCount++; |
|
} |
|
textFile.close(); |
|
cerr << m_size << " words" << endl; |
|
|
|
|
|
m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); |
|
m_sentenceEnd = (INDEX*) calloc( sizeof( INDEX ), m_sentenceCount ); |
|
|
|
if (m_array == NULL) { |
|
cerr << "cannot allocate memory to m_array" << endl; |
|
exit(1); |
|
} |
|
|
|
if (m_sentenceEnd == NULL) { |
|
cerr << "cannot allocate memory to m_sentenceEnd" << endl; |
|
exit(1); |
|
} |
|
|
|
|
|
int wordIndex = 0; |
|
int sentenceId = 0; |
|
textFile.open(fileName.c_str()); |
|
|
|
if (!textFile) { |
|
cerr << "no such file or directory " << fileName << endl; |
|
exit(1); |
|
} |
|
|
|
fileP = &textFile; |
|
while(!fileP->eof()) { |
|
SAFE_GETLINE((*fileP), line, LINE_MAX_LENGTH, '\n'); |
|
if (fileP->eof()) break; |
|
vector< WORD_ID > words = m_vcb.Tokenize( line ); |
|
vector< WORD_ID >::const_iterator i; |
|
|
|
for( i=words.begin(); i!=words.end(); i++) { |
|
m_array[ wordIndex++ ] = *i; |
|
} |
|
m_sentenceEnd[ sentenceId++ ] = wordIndex-1; |
|
} |
|
textFile.close(); |
|
cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl; |
|
} |
|
|
|
// Looks `id` up in the corpus vocabulary and returns what m_vcb.GetWord()
// yields for it.
WORD TargetCorpus::GetWordFromId( const WORD_ID id ) const
{
  return m_vcb.GetWord( id );
}
|
|
|
// Returns the vocabulary entry for the word at offset `word` inside sentence
// number `sentence`: the id comes from GetWordId(), the entry from m_vcb.
WORD TargetCorpus::GetWord( INDEX sentence, int word ) const
{
  const WORD_ID wordId = GetWordId( sentence, word );
  return m_vcb.GetWord( wordId );
}
|
|
|
// Returns the id stored in m_array for the word at offset `word` within
// sentence `sentence`. No bounds checking is performed on either argument.
WORD_ID TargetCorpus::GetWordId( INDEX sentence, int word ) const
{
  if (sentence != 0) {
    // Later sentences begin one slot after the previous sentence's last word.
    const INDEX previousEnd = m_sentenceEnd[ sentence-1 ];
    return m_array[ previousEnd + 1 + word ];
  }
  // The first sentence begins at the start of the array.
  return m_array[ word ];
}
|
|
|
// Returns the number of words in sentence `sentence`, derived from the
// end-of-sentence indices in m_sentenceEnd. The result is narrowed to char,
// so lengths that do not fit in a char are not represented faithfully.
char TargetCorpus::GetSentenceLength( INDEX sentence ) const
{
  if (sentence != 0) {
    // Distance between this sentence's last word and the previous one's.
    const INDEX span = m_sentenceEnd[ sentence ] - m_sentenceEnd[ sentence-1 ];
    return static_cast<char>( span );
  }
  // First sentence: its last-word index plus one (the cast binds to the
  // index only, the addition happens afterwards).
  return static_cast<char>( m_sentenceEnd[ 0 ] ) + 1;
}
|
|
|
void TargetCorpus::Save(const string& fileName ) const |
|
{ |
|
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "w" ); |
|
if (pFile == NULL) { |
|
cerr << "Cannot open " << fileName << endl; |
|
exit(1); |
|
} |
|
|
|
fwrite( &m_size, sizeof(INDEX), 1, pFile ); |
|
fwrite( m_array, sizeof(WORD_ID), m_size, pFile ); |
|
|
|
fwrite( &m_sentenceCount, sizeof(INDEX), 1, pFile ); |
|
fwrite( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); |
|
fclose( pFile ); |
|
|
|
m_vcb.Save( fileName + ".tgt-vcb" ); |
|
} |
|
|
|
void TargetCorpus::Load(const string& fileName ) |
|
{ |
|
FILE *pFile = fopen ( (fileName + ".tgt").c_str() , "r" ); |
|
if (pFile == NULL) { |
|
cerr << "Cannot open " << fileName << endl; |
|
exit(1); |
|
} |
|
|
|
cerr << "loading from " << fileName << ".tgt" << endl; |
|
|
|
fread( &m_size, sizeof(INDEX), 1, pFile ); |
|
cerr << "words in corpus: " << m_size << endl; |
|
m_array = (WORD_ID*) calloc( sizeof(WORD_ID), m_size ); |
|
|
|
if (m_array == NULL) { |
|
cerr << "cannot allocate memory to m_array" << endl; |
|
exit(1); |
|
} |
|
|
|
fread( m_array, sizeof(WORD_ID), m_size, pFile ); |
|
|
|
fread( &m_sentenceCount, sizeof(INDEX), 1, pFile ); |
|
cerr << "sentences in corpus: " << m_sentenceCount << endl; |
|
m_sentenceEnd = (INDEX*) calloc( sizeof(INDEX), m_sentenceCount ); |
|
|
|
if (m_sentenceEnd == NULL) { |
|
cerr << "cannot allocate memory to m_sentenceEnd" << endl; |
|
exit(1); |
|
} |
|
|
|
fread( m_sentenceEnd, sizeof(INDEX), m_sentenceCount, pFile); |
|
fclose( pFile ); |
|
m_vcb.Load( fileName + ".tgt-vcb" ); |
|
} |
|
|