-
Notifications
You must be signed in to change notification settings - Fork 0
/
spam.lisp
112 lines (91 loc) · 3.3 KB
/
spam.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
(in-package :com.gigamonkeys.spam)
(defun classify (text)
(classification (score (extract-features text))))
(defparameter *max-ham-score* .4)
(defparameter *min-spam-score* .6)
(defun classification (score)
(cond
((<= score *max-ham-score*) 'ham)
((>= score *min-spam-score*) 'spam)
(t 'unsure)))
(defclass word-feature ()
((word
:initarg :word
:accessor word
:initform (error "Must supply :word")
:documentation "The word this feature represents.")
(spam-count
:initarg :spam-count
:accessor spam-count
:initform 0
:documentation "Number of spams we have seen this feature in.")
(ham-count
:initarg :ham-count
:accessor ham-count
:initform 0
:documentation "Number of hams we have seen this feature in.")))
(defvar *feature-database* (make-hash-table :test #'equal))
(defvar *total-hams* 0)
(defvar *total-spams* 0)
(defun clear-database ()
(setf *feature-database* (make-hash-table :test #'equal))
(setf *total-hams* 0)
(setf *total-spams* 0))
(defun intern-feature (word)
(or (gethash word *feature-database*)
(setf (gethash word *feature-database*)
(make-instance 'word-feature :word word))))
(defun extract-words (text)
(delete-duplicates
(cl-ppcre:all-matches-as-strings "[a-zA-Z]{3,}" text)
:test #'string=))
(defun extract-features (text)
(mapcar #'intern-feature (extract-words text)))
(defmethod print-object ((object word-feature) stream)
(print-unreadable-object (object stream :type t)
(with-slots (word ham-count spam-count) object
(format stream "~s :hams ~d :spams ~d" word ham-count spam-count))))
(defun train (text type)
(dolist (feature (extract-features text))
(increment-count feature type))
(increment-total-count type))
(defun increment-count (feature type)
(ecase type
(ham (incf (ham-count feature)))
(spam (incf (ham-count feature)))))
(defun increment-total-count (type)
(ecase type
(ham (incf *total-hams*))
(spam (incf *total-spams*))))
(defun spam-probability (feature)
(with-slots (spam-count ham-count) feature
(let ((spam-frequency (/ spam-count (max 1 *total-spams*)))
(ham-frequency (/ ham-count (max 1 *total-hams*))))
(/ spam-frequency (+ spam-frequency ham-frequency)))))
(defun bayesian-spam-probability (feature &optional
(assumed-probability 1/2)
(weight 1))
(let ((basic-probability (spam-probability feature))
(data-points (+ (spam-count feature) (ham-count feature))))
(/ (+ (* weight assumed-probability)
(* data-points basic-probability))
(+ weight data-points))))
(defun score (features)
(let ((spam-probs ()) (ham-probs ()) (number-of-probs 0))
(dolist (feature features)
(unless (untrained-p feature)
(let ((spam-prob (float (bayesian-spam-probability feature) 0.0d0)))
(push spam-prob spam-probs)
(push (- 1.0d0 spam-prob) ham-probs)
(incf number-of-probs))))
(let ((h (- 1 (fisher spam-probs number-of-probs)))
(s (- 1 (fisher ham-probs number-of-probs))))
(/ (+ (- 1 h) s) 2.0d0))))
(defun untrained-p (feature)
(with-slots (spam-count ham-count) feature
(and (zerop spam-count) (zerop ham-count))))
(defun fisher (probs number-of-probs)
"The Fisher computation described by Robinson."
(inverse-chi-square
(* -2 (log (reduce #'* probs)))
(* 2 number-of-probs)))