-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweetscleaner.h
35 lines (25 loc) · 1.16 KB
/
tweetscleaner.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#ifndef TWEETSCLEANER_H
#define TWEETSCLEANER_H
#include <string>
#include <unordered_set>
#include <vector>

#include "delafdict.h"
#include "spellchecker.h"
namespace casimiro {
/// Hash-based set of unique strings.
using StringUnorderedSet = std::unordered_set<std::string>;

/// Ordered sequence of string sets; this is the type TweetsCleaner's
/// constructor accepts for its foreign dictionaries.
using StringUnorderedSets = std::vector<StringUnorderedSet>;
/// Tweet-cleaning component that works with a DelafDict, a list of
/// foreign-word sets and a SpellChecker.
///
/// The three collaborators are stored as const references (see the private
/// members), not as copies. Assuming the constructor binds those members to
/// its arguments (the definition is not in this header - confirm), the
/// objects passed in must outlive the TweetsCleaner instance. Having
/// reference members also means the class cannot be copy-assigned.
///
/// All member functions other than the constructor are virtual, so a
/// subclass can override any of them.
class TweetsCleaner
{
public:
/// @param _delafDict    dictionary object; kept by reference.
/// @param _foreignDicts collection of string sets; kept by reference.
/// @param _speller      spell checker; kept by reference.
/// NOTE(review): do not pass temporaries here - the members are references.
TweetsCleaner(const DelafDict& _delafDict, const StringUnorderedSets& _foreignDicts, const SpellChecker& _speller);
/// Virtual so that deleting a derived object through a TweetsCleaner* is well defined.
virtual ~TweetsCleaner();
/// NOTE(review): behaviour is defined in the .cpp, which is not visible here.
/// Presumably reads tweets from _inFile and writes the cleaned result to
/// _outFile - confirm against the implementation.
/// @param _inFile               path-like string (presumably the input file).
/// @param _outFile              path-like string (presumably the output file).
/// @param _spelling             flag; presumably enables use of the spell checker - confirm.
/// @param _minChoosenWords      defaults to 0.
/// @param _maxUnknownWordsRate  defaults to 0.
/// Caution: default arguments of a virtual function are picked from the
/// static type used at the call site, so overriders should repeat exactly
/// the same defaults.
virtual void cleanTweets(const std::string& _inFile, const std::string& _outFile, bool _spelling, int _minChoosenWords = 0, double _maxUnknownWordsRate = 0) const;
/// Same parameter list and defaults as cleanTweets. Unlike cleanTweets this
/// member is NOT const-qualified, so it cannot be called on a const
/// TweetsCleaner.
/// NOTE(review): presumably groups the tweets per user before writing -
/// inferred from the name only; confirm against the implementation.
virtual void cleanTweetsGroupingByUser(const std::string& _inFile, const std::string& _outFile, bool _spelling, int _minChoosenWords = 0, double _maxUnknownWordsRate = 0);
/// Returns a StringVector by value, computed from _words.
/// StringVector is not declared in this header; presumably it comes from
/// one of the included project headers - confirm.
/// @param _words                input words.
/// @param _spelling             defaults to false.
/// @param _unknownWordsRateRet  optional pointer, defaults to nullptr;
///        presumably an out-parameter that receives the unknown-words rate
///        when non-null - confirm.
virtual StringVector chooseWords(const StringVector& _words, bool _spelling = false, double* _unknownWordsRateRet = nullptr) const;
private:
// Non-owning references to the collaborators; their lifetime is managed by the caller.
const DelafDict& m_delafDict;
const StringUnorderedSets& m_foreignDicts;
const SpellChecker& m_speller;
/// Private virtual hook: subclasses may override it but outside code cannot call it.
/// NOTE(review): presumably reports whether _word appears in m_foreignDicts - confirm.
virtual bool isForeignWord(const std::string& _word) const;
};
}
#endif // TWEETSCLEANER_H