本文参考了 Yehuda Koren 2008 年的论文《Factorization Meets the Neighborhood: a Multifaceted Collaborative Filtering Model》，实现了其中带偏置项的矩阵分解（SVD）模型：预测评分 = 全局平均分 + 用户偏置 + 物品偏置 + 用户因子向量与物品因子向量的内积。
代码如下:
# -*- coding: UTF-8 -*-
"""Biased matrix factorization (often called "SVD") for rating prediction.

A rating is predicted as ``av + bu[user] + bi[item] + dot(pu[user], qi[item])``
and the parameters are fitted by stochastic gradient descent over
tab-separated rating files whose first three columns are user id, item id
and rating.

The code runs unchanged on Python 2 and Python 3.
"""
import random
import math
import os

try:
    import cPickle as pickle  # Python 2
except ImportError:  # Python 3 has no cPickle module
    import pickle


class SVD(object):
    """Biased matrix-factorization recommender trained with SGD.

    User and item ids in the data files must be the integers ``1..userNum``
    and ``1..itemNum``: the parameter tables are sized from the number of
    distinct ids found in ``allfile`` and are indexed with ``id - 1``.
    """

    # Predictions are clamped to this rating scale.
    MIN_RATING = 1
    MAX_RATING = 5

    def __init__(self, allfile, trainfile, testfile, factorNum=10,
                 learningRate=0.01, regularization=0.05):
        """Read the data-set dimensions and initialise the parameters.

        Args:
            allfile: path of the file holding every rating; only used to
                count distinct users and items.
            trainfile: path of the training ratings.
            testfile: path of the held-out ratings used by ``test``.
            factorNum: number of latent factors per user and per item.
            learningRate: SGD step size.
            regularization: L2 penalty applied to biases and factors.

        Raises:
            ValueError: if ``trainfile`` contains no ratings.
        """
        self.allfile = allfile
        self.trainfile = trainfile
        self.testfile = testfile
        self.factorNum = factorNum
        self.userNum = self.getUserNum()
        self.itemNum = self.getItemNum()
        self.learningRate = learningRate
        self.regularization = regularization
        self.initModel()

    @staticmethod
    def _iterFields(filename):
        """Yield the stripped tab-separated fields of each non-blank line."""
        # The with-block guarantees the handle is closed, which the bare
        # ``for line in open(...)`` idiom does not.
        with open(filename) as handle:
            for line in handle:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                yield [field.strip() for field in line.split('\t')]

    @staticmethod
    def _iterRatings(filename):
        """Yield ``(user_index, item_index, rating)`` with 0-based indices."""
        for fields in SVD._iterFields(filename):
            yield int(fields[0]) - 1, int(fields[1]) - 1, float(fields[2])

    def _countDistinct(self, column):
        """Return how many distinct values ``column`` takes in ``allfile``."""
        return len(set(fields[column]
                       for fields in self._iterFields(self.allfile)))

    def getUserNum(self):
        """Return the number of distinct user ids in ``allfile``."""
        return self._countDistinct(0)

    def getItemNum(self):
        """Return the number of distinct item ids in ``allfile``."""
        return self._countDistinct(1)

    def initModel(self):
        """Set the global mean, zero biases and small random factors."""
        self.av = self.average(self.trainfile)
        self.bu = [0.0 for _ in range(self.userNum)]
        self.bi = [0.0 for _ in range(self.itemNum)]
        # Scaling by sqrt(factorNum) keeps the initial inner products small
        # whatever the number of factors.
        scale = math.sqrt(self.factorNum)
        self.pu = [[0.1 * random.random() / scale
                    for _ in range(self.factorNum)]
                   for _ in range(self.userNum)]
        self.qi = [[0.1 * random.random() / scale
                    for _ in range(self.factorNum)]
                   for _ in range(self.itemNum)]
        print("Initialize end.The user number is:%d,item number is:%d,the average score is:%f"
              % (self.userNum, self.itemNum, self.av))

    def train(self, iterTimes=100):
        """Run SGD over the training file for at most ``iterTimes`` epochs.

        After every epoch the model is evaluated on the test file; training
        stops early as soon as the test RMSE is higher than in the previous
        epoch.

        Raises:
            ValueError: if the test file contains no ratings.
        """
        print("Beginning to train the model......")
        rate = self.learningRate
        penalty = self.regularization
        preRmse = 10000.0
        for epoch in range(iterTimes):
            for user, item, rating in self._iterRatings(self.trainfile):
                userFactors = self.pu[user]
                itemFactors = self.qi[item]
                # Error between the observed and the predicted rating.
                eui = rating - self.predictScore(
                    self.av, self.bu[user], self.bi[item],
                    userFactors, itemFactors)
                self.bu[user] += rate * (eui - penalty * self.bu[user])
                self.bi[item] += rate * (eui - penalty * self.bi[item])
                for k in range(self.factorNum):
                    # Keep the old user factor: the item update must use the
                    # value from before this step.
                    oldUserFactor = userFactors[k]
                    userFactors[k] += rate * (
                        eui * itemFactors[k] - penalty * userFactors[k])
                    itemFactors[k] += rate * (
                        eui * oldUserFactor - penalty * itemFactors[k])
            # Evaluate once per epoch and reuse both metrics.
            curRmse, curMae = self.test(
                self.av, self.bu, self.bi, self.pu, self.qi)
            print("Iteration %d times,RMSE is : %f,MAE is : %f"
                  % (epoch + 1, curRmse, curMae))
            if curRmse > preRmse:
                break
            preRmse = curRmse
        print("Iteration finished!")

    def test(self, av, bu, bi, pu, qi):
        """Evaluate the given parameters on the test file.

        Returns:
            A ``(rmse, mae)`` tuple.

        Raises:
            ValueError: if the test file contains no ratings.
        """
        squaredError = 0.0
        absoluteError = 0.0
        cnt = 0
        for user, item, score in self._iterRatings(self.testfile):
            diff = score - self.predictScore(
                av, bu[user], bi[item], pu[user], qi[item])
            squaredError += diff * diff
            absoluteError += abs(diff)
            cnt += 1
        if cnt == 0:
            raise ValueError("no ratings found in %r" % (self.testfile,))
        return math.sqrt(squaredError / cnt), absoluteError / cnt

    def average(self, filename):
        """Return the mean of the rating column (third field) of ``filename``.

        Raises:
            ValueError: if the file contains no ratings.
        """
        total = 0.0
        cnt = 0
        for fields in self._iterFields(filename):
            total += float(fields[2])
            cnt += 1
        if cnt == 0:
            raise ValueError("no ratings found in %r" % (filename,))
        return total / cnt

    def innerProduct(self, v1, v2):
        """Return the dot product of two equally long numeric sequences."""
        # An explicit left-to-right loop keeps the floating-point result
        # identical on every Python version.
        result = 0.0
        for a, b in zip(v1, v2):
            result += a * b
        return result

    def predictScore(self, av, bu, bi, pu, qi):
        """Return ``av + bu + bi + dot(pu, qi)`` clamped to the rating scale."""
        pscore = av + bu + bi + self.innerProduct(pu, qi)
        if pscore < self.MIN_RATING:
            return self.MIN_RATING
        if pscore > self.MAX_RATING:
            return self.MAX_RATING
        return pscore


if __name__ == '__main__':
    # os.path.join keeps the script runnable outside Windows as well.
    s = SVD(os.path.join("data", "u.data"),
            os.path.join("data", "ua.base"),
            os.path.join("data", "ua.test"))
    s.train()
实验结果如下:
Initialize end.The user number is:943,item number is:1682,the average score is:3.523827
Beginning to train the model......
Iteration 1 times,RMSE is : 1.002799,MAE is : 0.807791
Iteration 2 times,RMSE is : 0.982096,MAE is : 0.783726
Iteration 3 times,RMSE is : 0.972882,MAE is : 0.774163
Iteration 4 times,RMSE is : 0.967721,MAE is : 0.769057
Iteration 5 times,RMSE is : 0.964556,MAE is : 0.765856
Iteration 6 times,RMSE is : 0.962501,MAE is : 0.763699
Iteration 7 times,RMSE is : 0.961121,MAE is : 0.762131
Iteration 8 times,RMSE is : 0.960174,MAE is : 0.760974
Iteration 9 times,RMSE is : 0.959496,MAE is : 0.760075
Iteration 10 times,RMSE is : 0.958957,MAE is : 0.759327
Iteration 11 times,RMSE is : 0.958456,MAE is : 0.758648
Iteration 12 times,RMSE is : 0.957879,MAE is : 0.757935
Iteration 13 times,RMSE is : 0.957088,MAE is : 0.757073
Iteration 14 times,RMSE is : 0.955944,MAE is : 0.755947
Iteration 15 times,RMSE is : 0.954353,MAE is : 0.754484
Iteration 16 times,RMSE is : 0.952345,MAE is : 0.752735
Iteration 17 times,RMSE is : 0.950108,MAE is : 0.750828
Iteration 18 times,RMSE is : 0.947883,MAE is : 0.748934
Iteration 19 times,RMSE is : 0.945824,MAE is : 0.747158
Iteration 20 times,RMSE is : 0.943971,MAE is : 0.745539
Iteration 21 times,RMSE is : 0.942294,MAE is : 0.744083
Iteration 22 times,RMSE is : 0.940736,MAE is : 0.742716
Iteration 23 times,RMSE is : 0.939252,MAE is : 0.741392
Iteration 24 times,RMSE is : 0.937806,MAE is : 0.740099
Iteration 25 times,RMSE is : 0.936384,MAE is : 0.738842
Iteration 26 times,RMSE is : 0.934983,MAE is : 0.737613
Iteration 27 times,RMSE is : 0.933612,MAE is : 0.736412
Iteration 28 times,RMSE is : 0.932294,MAE is : 0.735245
Iteration 29 times,RMSE is : 0.931057,MAE is : 0.734105
Iteration 30 times,RMSE is : 0.929926,MAE is : 0.733049
Iteration 31 times,RMSE is : 0.928929,MAE is : 0.732111
Iteration 32 times,RMSE is : 0.928082,MAE is : 0.731301
Iteration 33 times,RMSE is : 0.927391,MAE is : 0.730614
Iteration 34 times,RMSE is : 0.926858,MAE is : 0.730037
Iteration 35 times,RMSE is : 0.926480,MAE is : 0.729576
Iteration 36 times,RMSE is : 0.926256,MAE is : 0.729231
Iteration 37 times,RMSE is : 0.926174,MAE is : 0.728978
Iteration 38 times,RMSE is : 0.926232,MAE is : 0.728840
Iteration finished!
训练在第38次迭代时因测试集RMSE不再下降（0.926232 > 0.926174）而提前停止。最终RMSE约为0.926，MAE约为0.729，可见效果一般，后面会尝试改进。