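# Principle (Bayes' rule): P(Ci|W) = P(w1 w2 ... wn|Ci) P(Ci) / P(W)
# where Ci is the i-th class and W = (w1, w2, ..., wn) is the word vector
# of the document being classified.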
# Create the test data set.
def loadDataSet():
    """Return a small hard-coded corpus of tokenized posts with labels.

    Returns:
        A tuple ``(postingList, classVec)``. ``postingList`` is a list of
        six posts, each a list of word tokens. ``classVec`` holds one label
        per post, in the same order.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive text, 0 = normal speech
    return postingList, classVec
# Build the vocabulary; later steps use it as the position lookup when converting documents to word vectors.
def createVocabList(dataSet):
    """Return the list of distinct words that occur in ``dataSet``.

    Args:
        dataSet: An iterable of documents, each an iterable of hashable
            word tokens.

    Returns:
        A list containing every distinct word exactly once, in order of
        first appearance. The position of a word in this list is the
        position it is looked up by when documents are turned into word
        vectors, so a reproducible order keeps those vectors comparable
        across runs (plain set iteration order is not stable).
    """
    vocabList = []
    seen = set()
    for document in dataSet:
        for word in document:
            if word not in seen:
                seen.add(word)
                vocabList.append(word)
    return vocabList
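# Convert the data set into word vectors by looking each word up in the vocabulary.
# This can be a bag-of-words model, a set-of-words model, or another representation.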
def setOfWord2vec(vocabList, inputSet):
    """Encode a document as a 0/1 presence vector over ``vocabList``.

    Args:
        vocabList: Sequence of vocabulary words; a word's index here is its
            position in the returned vector.
        inputSet: Iterable of word tokens making up the document.

    Returns:
        A list of ints with ``len(vocabList)`` entries; entry ``k`` is 1 if
        ``vocabList[k]`` occurs in ``inputSet`` and 0 otherwise. Words that
        are not in the vocabulary are reported on stdout and skipped.
    """
    # Build the word -> position table once instead of scanning the list
    # for every input word. setdefault keeps the first position of a
    # repeated word, which is what list.index() would have returned.
    wordIndex = {}
    for position, vocabWord in enumerate(vocabList):
        wordIndex.setdefault(vocabWord, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = wordIndex.get(word)
        if position is None:
            print("the word:%s is not in my vocabuList" % (word,))
        else:
            returnVec[position] = 1
    return returnVec
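# Compute P(Ci) and P(W|Ci).
# P(W|Ci) = P(w1|Ci) P(w2|Ci) ... P(wn|Ci)
# If any single P(wk|Ci) is 0, the whole product P(W|Ci) becomes 0; to remove this effect, every word count starts from a baseline of 1.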
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word vectors.

    Args:
        trainMatrix: Non-empty sequence of equal-length numeric word
            vectors, one per training document.
        trainCategory: Sequence of labels, one per document. A label equal
            to 1 puts the document in class 1; any other label puts it in
            class 0.

    Returns:
        A tuple ``(p0Vect, p1Vect, pAbusive)``: the per-word
        log-probability arrays for class 0 and class 1, and the class-1
        prior computed as ``sum(trainCategory) / len(trainMatrix)``.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)

    # Counts start at 1 (denominators at 1.0) so that a word never seen in
    # a class does not force the whole product P(W|Ci) to zero.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 1.0
    p1Denom = 1.0

    for i, wordVec in enumerate(trainMatrix):
        if trainCategory[i] == 1:
            p1Num += wordVec
            p1Denom += sum(wordVec)
        else:
            p0Num += wordVec
            p0Denom += sum(wordVec)

    # Work in log space: multiplying many small probabilities would
    # underflow to 0, whereas their logs can simply be added.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for a word vector.

    Args:
        vec2Classify: Numeric word vector of the document to classify.
        p0Vec: Per-word log-probabilities for class 0.
        p1Vec: Per-word log-probabilities for class 1.
        pClass1: Prior probability of class 1, in the range [0, 1].

    Returns:
        1 if the class-1 log score is strictly greater than the class-0
        log score, otherwise 0.

    Raises:
        ValueError: If ``pClass1`` is outside [0, 1].
    """
    if not 0.0 <= pClass1 <= 1.0:
        raise ValueError("pClass1 must be within [0, 1], got %r" % (pClass1,))
    # A prior of exactly 0 or 1 settles the answer; log(0) is undefined
    # and math.log would raise a domain error for it.
    if pClass1 == 0.0:
        return 0
    if pClass1 == 1.0:
        return 1

    # Element-wise products are summed in plain Python so that lists and
    # arrays are both accepted for the vector arguments.
    p1 = sum(x * w for x, w in zip(vec2Classify, p1Vec)) + math.log(pClass1)
    p0 = sum(x * w for x, w in zip(vec2Classify, p0Vec)) + math.log(1 - pClass1)
    return 1 if p1 > p0 else 0
# Wrap the steps above in a function for testing.
def testingNB():
    """Train on the built-in sample corpus and classify one test entry.

    Loads the sample posts with ``loadDataSet``, builds the vocabulary with
    ``createVocabList``, vectorises each post with ``setOfWord2vec``, trains
    with ``trainNB0`` and prints the label ``classifyNB`` assigns to the
    entry ``['love', 'my', 'dalmation']``.
    """
    listOfPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOfPosts)
    trainMat = [setOfWord2vec(myVocabList, postinDoc) for postinDoc in listOfPosts]
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)

    testEntry = np.array(['love', 'my', 'dalmation'])
    thisDoc = setOfWord2vec(myVocabList, testEntry)
    # Single pre-formatted string so the output is the same whether print
    # is the Python 2 statement or the Python 3 function.
    print("%s classified as : %s" % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))