| | #include "Vocabulary.h" |
| |
|
| | #pragma once |
| |
|
// LINE_MAX_LENGTH: fixed limit of 10000. Nothing in this header refers to it.
// NOTE(review): presumably the maximum input line length used by the
// implementation file -- confirm there before changing the value.
| | #define LINE_MAX_LENGTH 10000 |
| |
|
| | namespace tmmt |
| | { |
| |
|
| | class SuffixArray |
| | { |
| | public: |
| | typedef unsigned int INDEX; |
| |
|
| | private: |
| | std::vector< std::vector< WORD_ID > > corpus; |
| |
|
| | WORD_ID *m_array; |
| | INDEX *m_index; |
| | INDEX *m_buffer; |
| | char *m_wordInSentence; |
| | size_t *m_sentence; |
| | char *m_sentenceLength; |
| | WORD_ID m_endOfSentence; |
| | Vocabulary m_vcb; |
| | INDEX m_size; |
| |
|
| | public: |
| | SuffixArray( std::string fileName ); |
| | ~SuffixArray(); |
| |
|
| | void Sort(INDEX start, INDEX end); |
| | int CompareIndex( INDEX a, INDEX b ) const; |
| | inline int CompareWord( WORD_ID a, WORD_ID b ) const; |
| | int Count( const std::vector< WORD > &phrase ); |
| | bool MinCount( const std::vector< WORD > &phrase, INDEX min ); |
| | bool Exists( const std::vector< WORD > &phrase ); |
| | int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); |
| | int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); |
| | INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); |
| | INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); |
| | int Match( const std::vector< WORD > &phrase, INDEX index ); |
| | void List( INDEX start, INDEX end ); |
| | inline INDEX GetPosition( INDEX index ) { |
| | return m_index[ index ]; |
| | } |
| | inline size_t GetSentence( INDEX position ) { |
| | return m_sentence[position]; |
| | } |
| | inline char GetWordInSentence( INDEX position ) { |
| | return m_wordInSentence[position]; |
| | } |
| | inline char GetSentenceLength( size_t sentenceId ) { |
| | return m_sentenceLength[sentenceId]; |
| | } |
| | inline INDEX GetSize() { |
| | return m_size; |
| | } |
| |
|
| | Vocabulary &GetVocabulary() { |
| | return m_vcb; |
| | } |
| | const std::vector< std::vector< WORD_ID > > &GetCorpus() const { |
| | return corpus; |
| | } |
| | }; |
| |
|
| | } |
| |
|
| |
|