一、源码说明
基于物品的协同过滤算法和基于用户的协同过滤算法类似,以给该物品评分的用户作为物品的特征向量,从而计算物品之间的余弦相似度。以下代码根据点击打开链接修改而来,修改了计算相似度的函数和进行推荐的函数。推荐效果的准确度不到10%,基于用户的准确度在20%。
二、准确度不高的原因分析
从推荐的结果看,根据代码设定是要推荐TOP-10的一个列表,但是结果往往很多只有3、4个,并没有10个。原因是用户u评分了物品i,而与物品i相似的大部分物品已经存在于用户u的评分物品中了,那么就不能再次推荐。
三、python源码
import random as rd , math as mt, operator as op
"""
SplitData(data, M, k, seed) approximately splits data into M-1 parts of training data and 1 part of test data.
This function should be called M times, with k varying from 0 to M-1
and seed kept constant.
"""
def SplitData(data, M, k, seed):
    """Split (user, item) pairs into a training list and a test list.

    Every pair is assigned a pseudo-random bucket in [0, M-1]. Pairs whose
    bucket equals ``k`` go to the test list, all others to the training list.
    Because the generator is reseeded on every call, calling this function M
    times with the same ``seed`` and k = 0 .. M-1 yields M disjoint test folds
    that together cover ``data``.

    Args:
        data: iterable of (user, item) pairs.
        M: number of folds; must be >= 1.
        k: index of the fold used as test data, 0 <= k < M.
        seed: seed passed to the random module so the split is reproducible.

    Returns:
        (train, test): two lists of ``[user, item]`` lists.
    """
    test = []
    train = []
    rd.seed(seed)  # reseeds the module-level generator
    for user, item in data:
        # randint() includes both endpoints, so the upper bound must be M - 1
        # to obtain exactly M buckets (0 .. M-1).
        if rd.randint(0, M - 1) == k:
            test.append([user, item])
        else:
            train.append([user, item])
    return train, test
"""
The train data and test data are each represented as a dictionary of the form {key: set}.
The key is a user; the set contains the movies that the user has rated.
"""
def list2dic(listdata):
    """Group (user, item) pairs by user.

    Args:
        listdata: iterable of (user, item) pairs.

    Returns:
        A plain dict mapping each user to the set of items paired with that
        user; duplicate pairs collapse into a single set entry.
    """
    dicdata = dict()
    for user, item in listdata:
        # setdefault creates the set on first sight of a user, so the add
        # does not have to be written once per branch.
        dicdata.setdefault(user, set()).add(item)
    return dicdata
#
def createItemtrain(train):
    """Build the inverted index item -> users from (user, item) pairs.

    Args:
        train: iterable of (user, item) pairs.

    Returns:
        A plain dict mapping each item to the set of users paired with it.
    """
    itemtrain = dict()
    for user, item in train:
        # Same grouping idiom as list2dic, keyed by item instead of user.
        itemtrain.setdefault(item, set()).add(user)
    return itemtrain
def ItemSimilarity1(itemtrain):
    """Compute the cosine similarity between every pair of distinct items.

    Each item is represented by the set of users in ``itemtrain[item]``; the
    similarity of items i and j is

        |users(i) & users(j)| / sqrt(|users(i)| * |users(j)|)

    Args:
        itemtrain: dict mapping item -> set of users.

    Returns:
        A dict of dicts ``W`` where ``W[i][j]`` is the similarity of i and j
        for every j != i (pairs with no common user are present with 0.0).
        If either user set is empty the similarity is 0.0.
    """
    items = list(itemtrain)
    sizes = {i: len(itemtrain[i]) for i in items}
    W = {i: dict() for i in items}
    for pos, i in enumerate(items):
        users_i = itemtrain[i]
        # The measure is symmetric, so each unordered pair is evaluated once
        # and stored in both rows. Filling in this order keeps every row's
        # keys in the same order as iterating itemtrain.
        for j in items[pos + 1:]:
            denom = mt.sqrt(sizes[i] * sizes[j] * 1.0)
            # denom is 0 only when one of the user sets is empty.
            sim = len(users_i & itemtrain[j]) / denom if denom else 0.0
            W[i][j] = sim
            W[j][i] = sim
    return W
# top-N list for user u
def Recommand(train, W, K, N):
    """Build a top-N recommendation list for every user in ``train``.

    For each item i a user has rated, the K items most similar to i are
    looked up in ``W``; every such neighbour j that the user has not rated
    receives the similarity W[i][j] added to its score. The N best-scoring
    candidates form the user's list.

    Args:
        train: dict mapping user -> set of rated items.
        W: dict of dicts, ``W[i][j]`` = similarity of items i and j.
        K: number of nearest neighbours considered per rated item.
        N: maximum length of each recommendation list.

    Returns:
        dict mapping user -> list of at most N items, best first. A user
        with no rated items gets an empty list.

    Raises:
        KeyError: if a rated item has no row in ``W``.
    """
    by_score = op.itemgetter(1)
    # item -> its K most similar (item, similarity) pairs. The neighbour list
    # depends only on the item, so it is sorted once and reused across users.
    neighbours = dict()
    allrank = dict()
    for u, rated in train.items():
        # Scores accumulate over ALL items the user rated, so the dict must
        # be created once per user, not once per rated item.
        rank = dict()
        for i in rated:
            if i not in neighbours:
                neighbours[i] = sorted(W[i].items(), key=by_score, reverse=True)[0:K]
            for j, wij in neighbours[i]:
                if j in rated:
                    continue  # never recommend something already rated
                rank[j] = rank.get(j, 0) + wij
        best = sorted(rank.items(), key=by_score, reverse=True)[0:N]
        allrank[u] = [item for item, _ in best]
    return allrank
# compute precision
def Precision(allrank, test, N):
    """Compute the precision of the recommendation lists against test data.

    A hit is a recommended item that appears in the user's test set. Every
    evaluated user contributes N to the denominator, regardless of how many
    items the list actually holds.

    Args:
        allrank: dict mapping user -> list of recommended items.
        test: dict mapping user -> set of held-out items.
        N: nominal length of each recommendation list.

    Returns:
        hits / (N * number of evaluated users) as a float, or 0.0 when no
        user could be evaluated.
    """
    hit = 0
    total = 0  # named 'total' so the builtin all() is not shadowed
    for user, tu in test.items():
        # NOTE(review): the original indentation was lost, so it is ambiguous
        # whether test users without a recommendation list should still add N
        # to the denominator; here they are skipped entirely - confirm.
        if user not in allrank:
            continue
        hit += sum(1 for item in allrank[user] if item in tu)
        total += N
    if total == 0:
        # Nothing to evaluate: report 0.0 instead of dividing by zero.
        return 0.0
    return hit / (total * 1.0)
"""
main function
"""
# Driver: load (user, item) pairs from the ratings file, hold out one fold as
# test data, build item-item similarities, recommend, and print the precision.
filestring = '/home/sysu-hgavin/文档/ml-1m/ratings.dat'
data = []
# The context manager closes the file even if a line fails to parse.
with open(filestring, 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file
        # Only the first two "::"-separated fields (user id, item id) are used.
        fields = line.split("::")[:2]
        data.append([int(fields[0]), int(fields[1])])
M = 8
seed = 2
N = 10  # top-N
K = 10  # the number of neighbors
# Only fold 1 is evaluated here; loop k over the fold indices (keeping the
# seed fixed) for a full cross-validation.
train, test = SplitData(data, M, 1, seed)  # generate train data and test data
# The inverted index must be built from the pair list, before train is
# rebound to its dict form on the next line.
itemtrain = createItemtrain(train)
train = list2dic(train)
test = list2dic(test)
W = ItemSimilarity1(itemtrain)
allrank = Recommand(train, W, K, N)
precision = Precision(allrank, test, N)
print (precision)