forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Categories.h
238 lines (198 loc) · 5.97 KB
/
Categories.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
//
// Gigablast, Copyright March 2005
// Author: Javier Olivares <[email protected]>
//
// Stores Categories in a Hierarchy
// Based on DMOZ
//
#ifndef _CATEGORY_H_
#define _CATEGORY_H_
#include "Mem.h"
#include "HashTable.h"
// ---- buffer sizes -------------------------------------------------------
// 100 MiB. NOTE(review): presumably the size of the big RDF read buffer
// (m_rdfBuffer) -- confirm in the implementation file.
#define RDFBUFFER_SIZE (1024*1024*100)
// 32 KiB. Sizes the Categories::m_rdfSmallBuffer member array.
#define RDFSMALLBUFFER_SIZE (32*1024)
// ---- file names ---------------------------------------------------------
// Going by the names these are the two DMOZ RDF dumps that get parsed ...
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
// ... and these are the gbdmoz.* files generated from them.
// NOTE(review): which tool writes/reads each file is not visible here.
#define STRUCTURE_OUTPUT_FILE "gbdmoz.structure.dat"
#define CONTENT_OUTPUT_FILE "gbdmoz.content.dat"
#define URL_OUTPUT_FILE "gbdmoz.urls.dat"
#define URLTEXT_OUTPUT_FILE "gbdmoz.urls.txt"
// ---- limits -------------------------------------------------------------
// NOTE(review): the *_LEN limits look like maximum string lengths excluding
// the terminating NUL (m_tagRecfer is declared with MAX_TAG_LEN+1 bytes);
// the others are only named here, their users live outside this header.
#define MAX_CATID_LEN 63
#define MAX_TAG_LEN 127
#define MAX_URL_CATIDS 64
#define MAX_URLTXT_SIZE 500000
#define MAX_CATIDS 64
#define MAX_CATNAME_LEN 1024
#define HASHTABLE_SIZE (1024*1024)
#define URLHASHTABLE_SIZE (10*1024*1024)
#define MAX_SUB_CATS 1024
// ---- sub category types -------------------------------------------------
// NOTE(review): presumably the values stored in SubCategory::m_type (a
// char; every value below fits in a signed char). The spacing of the
// numbers suggests they double as a sort/display order -- confirm.
#define SUBCAT_LETTERBAR 10
#define SUBCAT_NARROW2 30
#define SUBCAT_SYMBOLIC2 31
#define SUBCAT_NARROW1 50
#define SUBCAT_SYMBOLIC1 51
#define SUBCAT_NARROW 70
#define SUBCAT_SYMBOLIC 71
#define SUBCAT_RELATED 90
#define SUBCAT_ALTLANG 110
// One category record; Categories keeps an array of these in m_cats.
// NOTE(review): the fields are plain long/short, whose sizes differ between
// 32-bit and 64-bit builds. If these records are ever read from or written
// to disk as raw bytes, the file layout is platform dependent -- confirm
// against the load/save code before changing any type or the field order.
struct Category {
// category id
long m_catid;
// id of the parent category (presumably a catid, not an array index)
long m_parentid;
//short m_numSymParents;
//long m_symParentsOffset;
// offset and length of the category name
// NOTE(review): presumably relative to Categories::m_nameBuffer -- confirm
long m_nameOffset;
short m_nameLen;
// NOTE(review): presumably byte offsets of this category's entry in the
// structure and content files -- confirm
unsigned long m_structureOffset;
unsigned long m_contentOffset;
// url count for this category; returned by
// Categories::getNumUrlsFromIndex()
long m_numUrls;
};
// Pairs a hash value with an index into the category array;
// Categories::m_catHash points at an array of these.
// NOTE(review): what exactly is hashed (presumably the category path/name)
// and how the array is ordered is not visible in this header.
struct CategoryHash {
unsigned long m_hash;
long m_catIndex;
};
// One sub category entry as filled in by Categories::generateSubCats().
// NOTE(review): the offsets presumably refer to the catBuffer that is passed
// to generateSubCats() alongside the SubCategory array -- confirm.
struct SubCategory {
// offset and length of the prefix string
long m_prefixOffset;
long m_prefixLen;
// offset and length of the name string
long m_nameOffset;
long m_nameLen;
// presumably one of the SUBCAT_* values
char m_type;
};
// In-memory view of the DMOZ based category hierarchy (see the file header).
// Holds the Category array, the name buffer and the category hash, and
// declares the lookup, printing and RDF parsing entry points. All bodies
// except getNumUrlsFromIndex() live outside this header.
class Categories {
public:
	Categories();
	~Categories();

	// NOTE(review): presumably a read() style wrapper around the file
	// descriptor "fileid"; return value semantics are not visible here
	long fileRead ( int fileid, void *buf, size_t count );

	void reset();

	// load the hierarchy from a file
	long loadCategories ( char *filename );

	// . this is called by loadCategories() and constructs the bad table
	// . it will load/save it from/to disk, too
	// . NOTE(review): the original comment named "m_adultTable"; the only
	//   table member declared in this class is m_badTable, so that name
	//   looks stale
	bool makeBadHashTable ( ) ;
	bool addUrlsToBadHashTable ( long catid ) ;

	// get the index of a cat from its id
	// -1 if not found
	long getIndexFromId ( long catid );
	long getIndexFromPath ( char *str, long strLen );
	long getIdFromPath ( char *str, long strLen );

	// determine if a category should be printed RTL
	bool isIdRTLStart ( long catid );
	bool isIndexRTLStart ( long catIndex );
	bool isIdRTL ( long catid );
	bool isIndexRTL ( long catIndex );

	// see if the category is Adult
	bool isIdAdultStart ( long catid );
	bool isIndexAdultStart ( long catIndex );
	bool isIdAdult ( long catid );
	bool isIndexAdult ( long catIndex );

	// is it in a bad cat, like adult, gambling, online pharmacies
	bool isIdBadStart ( long catid );
	bool isIndexBadStart ( long catIndex );
	bool isIdBad ( long catid );
	bool isIndexBad ( long catIndex );

	// is this url directly in a dmoz adult category?
	bool isInBadCat ( Url *u ) ;
	bool isInBadCat ( unsigned long urlHash );

	// print info of cats
	void printCats ( long start, long end );

	// print the path of this category
	void printPathFromId ( SafeBuf *sb ,
			       long catid,
			       bool raw = false,
			       bool isRTL = false );
	void printPathFromIndex ( SafeBuf *sb ,
				  long catIndex,
				  bool raw = false,
				  bool isRTL = false );

	// print the path bread crumb links for this category
	void printPathCrumbFromId ( SafeBuf *sb ,
				    long catid,
				    bool isRTL = false );
	// NOTE(review): the second parameter is spelled "catid" but, going by
	// the function name and its *FromId sibling, it is presumably a
	// category index -- confirm against the definition
	void printPathCrumbFromIndex ( SafeBuf *sb ,
				       long catid,
				       bool isRTL = false );

	// . get the title and summary for a specific url
	//   and catid
	// . every output buffer is optional (defaults to NULL with a max
	//   length of 0)
	// . note that anchorLen is an unsigned char*, unlike the long*
	//   used for titleLen and summLen
	bool getTitleAndSummary ( char *url,
				  long urlLen,
				  long catid,
				  char *title = NULL,
				  long *titleLen = NULL,
				  long maxTitleLen = 0,
				  char *summ = NULL,
				  long *summLen = NULL,
				  long maxSummLen = 0,
				  char *anchor = NULL,
				  unsigned char *anchorLen = NULL,
				  long maxAnchorLen = 0 ,
				  long niceness = 0 ,
				  bool justAddToTable = false );

	// normalize a url string
	long fixUrl ( char *url, long urlLen );

	// generate sub categories for a given catid
	long generateSubCats ( long catid,
			       SubCategory *subCats,
			       char **catBuffer,
			       long *catBufferSize,
			       long *catBufferLen,
			       bool allowRealloc = true );

	// . number of urls recorded for the category at "catIndex"
	// . returns 0 when nothing is loaded or the index is out of range;
	//   getIndexFromId() reports "not found" as -1, and feeding that
	//   straight in here used to read before the start of m_cats
	// . assumes m_numCats is the number of entries in m_cats
	long getNumUrlsFromIndex ( long catIndex ) {
		if ( ! m_cats || catIndex < 0 || catIndex >= m_numCats )
			return 0;
		return m_cats[catIndex].m_numUrls;
	}

	// creates a directory search request url
	// (the defaults are commented out, so every argument is required)
	//void createDirectorySearchUrl ( Url *url,
	long createDirSearchRequest ( char *requestBuf,
				      long requestBufSize,
				      long catid,
				      char *hostname,
				      long hostnameLen,
				      char *coll,
				      long collLen,
				      char *cgi ,//= NULL,
				      long cgiLen ,//= 0,
				      bool cgiFromRequest ,//= false ,
				      class HttpRequest *r );

	// language tables
	// NOTE(review): what "addr" is for findLanguage() is not visible here
	bool initLangTables(void);
	bool loadLangTables(void);
	uint8_t findLanguage(char *addr);

	// Categories
	Category *m_cats;
	long m_numCats;

	// name buffer
	char *m_nameBuffer;
	long m_nameBufferSize;

	// symbolic parent buffer
	//long *m_symParents;
	//long m_numSymParents;

	// hash buffer
	CategoryHash *m_catHash;

	// full buffer
	char *m_buffer;
	long m_bufferSize;

protected:
	// for parsing the original dmoz files
	char* incRdfPtr ( long skip = 1 );
	long rdfParse ( char *tagName );
	long rdfNextTag ( );
	long fillNextString ( char *str, long max );
	long fillNextTagBody ( char *str, long max );

	// rdf stream
	char *m_rdfPtr;
	char *m_rdfEnd;
	//std::ifstream m_rdfStream;
	// an int since the ifstream was dropped; presumably a file descriptor
	int m_rdfStream;
	char *m_rdfBuffer;
	long m_rdfBufferSize;
	long m_currOffset;

	// static rdf buffer
	char m_rdfSmallBuffer[RDFSMALLBUFFER_SIZE];

	// tag buffer
	// NOTE(review): "m_tagRecfer" looks like a search-and-replace
	// casualty of "m_tagBuffer"; left as is because the implementation
	// file refers to it by this name
	char m_tagRecfer[MAX_TAG_LEN+1];
	long m_tagLen;

	// urls in bad categories, see makeBadHashTable()
	HashTable m_badTable;

	// sub category buffer
	//SubCategory m_subCats[MAX_SUB_CATS];
	//long m_numSubCats;
};
// Global Categories objects; only declared here, the definitions live in
// another translation unit.
// NOTE(review): presumably g_categories points at whichever of
// g_categories1 / g_categories2 is currently in use -- confirm against
// the code that assigns it.
extern class Categories g_categories1;
extern class Categories g_categories2;
extern class Categories *g_categories;
#endif