- 步骤
- (1) 找到和目标用户兴趣相似的用户集合。
- (2) 找到这个集合中的用户喜欢的,且目标用户没有听说过的物品推荐给目标用户。
如何计算相似度?
给定用户u和用户v,令N(u)表示用户u曾经有过正反馈的物品集合,令N(v)为用户v曾经有过正反馈的物品集合。余弦相似度计算如下:
wuv=|N(u)∩N(v)||N(u)||N(v)|−−−−−−−−−−√
举例,A,B,C,D,四位用户的购买记录如下:
A{a,b,d} B{a,c} C{b,e} D{b,c,d}
wAB=16√,wAC=16√,wAD=23
相似度代码
def UserSimilarity(train):
W = dict()
for u in train.keys():
for v in train.keys():
if u==v:
continue
W[u][v] = len(train[u] & train[v]);
W[u][v] /= math.sqrt(len( train[u] )* len(train[v]) * 1.0);
return W
但是上述代码的时间复杂度是 O(u*u),现实情况中有很多对用户的
|N(u)∩N(v)| 为0,我们刨除这些计算会节省很多时间,那么换一种思路:
建立物品-用户的倒排表,假定用户u和用户v同时属于倒排表中K个物品对应的用户列表,那么C[u][v]=K, 扫描倒排表中物品的用户列表,对应位置的C[u][v]+1 ,得到所有用户之间不为0的C[u][v]。
def calc_user_sim(self):
''' calculate user similarity matrix '''
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print ('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print ('build movie-users inverse table succ', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print ('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated items between users
usersim_mat = self.user_sim_mat
print ('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print ('build user co-rated movies matrix succ', file=sys.stderr)
# calculate similarity matrix
print ('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print ('calculating user similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print ('calculate user similarity matrix(similarity factor) succ',
file=sys.stderr)
print ('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
相似度矩阵计算之后,用户u对物品i的感兴趣程度如下:
p(u,i)=∑v∈S(u,k)∩N(i)wuvrvi
S(u, K)包含和用户u兴趣最接近的K个用户, N(i)是对物品i有过行为的用户集合, wuv是用户u和用户v的兴趣相似度
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += similarity_factor
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
完整可运行代码如下
将 数据 中的m1-1m数据下载之后放在同目录下
#-*- coding: utf-8 -*-
''' Created on 2015-06-22 @author: Lockvictor '''
import sys
import random
import math
import os
from operator import itemgetter
random.seed(0)
class UserBasedCF(object):
''' TopN recommendation - User Based Collaborative Filtering '''
def __init__(self):
self.trainset = {}
self.testset = {}
self.n_sim_user = 20
self.n_rec_movie = 10
self.user_sim_mat = {}
self.movie_popular = {}
self.movie_count = 0
print ('Similar user number = %d' % self.n_sim_user, file=sys.stderr)
print ('recommended movie number = %d' %
self.n_rec_movie, file=sys.stderr)
@staticmethod
def loadfile(filename):
''' load a file, return a generator. '''
fp = open(filename, 'r')
for i, line in enumerate(fp):
yield line.strip('\r\n')
if i % 100000 == 0:
print ('loading %s(%s)' % (filename, i), file=sys.stderr)
fp.close()
print ('load %s succ' % filename, file=sys.stderr)
def generate_dataset(self, filename, pivot=0.7):
''' load rating data and split it to training set and test set '''
trainset_len = 0
testset_len = 0
for line in self.loadfile(filename):
user, movie, rating, _ = line.split('::')
# split the data by pivot
if random.random() < pivot:
self.trainset.setdefault(user, {})
self.trainset[user][movie] = int(rating)
trainset_len += 1
else:
self.testset.setdefault(user, {})
self.testset[user][movie] = int(rating)
testset_len += 1
print ('split training set and test set succ', file=sys.stderr)
print ('train set = %s' % trainset_len, file=sys.stderr)
print ('test set = %s' % testset_len, file=sys.stderr)
def calc_user_sim(self):
''' calculate user similarity matrix '''
# build inverse table for item-users
# key=movieID, value=list of userIDs who have seen this movie
print ('building movie-users inverse table...', file=sys.stderr)
movie2users = dict()
for user, movies in self.trainset.items():
for movie in movies:
# inverse table for item-users
if movie not in movie2users:
movie2users[movie] = set()
movie2users[movie].add(user)
# count item popularity at the same time
if movie not in self.movie_popular:
self.movie_popular[movie] = 0
self.movie_popular[movie] += 1
print ('build movie-users inverse table succ', file=sys.stderr)
# save the total movie number, which will be used in evaluation
self.movie_count = len(movie2users)
print ('total movie number = %d' % self.movie_count, file=sys.stderr)
# count co-rated items between users
usersim_mat = self.user_sim_mat
print ('building user co-rated movies matrix...', file=sys.stderr)
for movie, users in movie2users.items():
for u in users:
for v in users:
if u == v:
continue
usersim_mat.setdefault(u, {})
usersim_mat[u].setdefault(v, 0)
usersim_mat[u][v] += 1
print ('build user co-rated movies matrix succ', file=sys.stderr)
# calculate similarity matrix
print ('calculating user similarity matrix...', file=sys.stderr)
simfactor_count = 0
PRINT_STEP = 2000000
for u, related_users in usersim_mat.items():
for v, count in related_users.items():
usersim_mat[u][v] = count / math.sqrt(
len(self.trainset[u]) * len(self.trainset[v]))
simfactor_count += 1
if simfactor_count % PRINT_STEP == 0:
print ('calculating user similarity factor(%d)' %
simfactor_count, file=sys.stderr)
print ('calculate user similarity matrix(similarity factor) succ',
file=sys.stderr)
print ('Total similarity factor number = %d' %
simfactor_count, file=sys.stderr)
def recommend(self, user):
''' Find K similar users and recommend N movies. '''
K = self.n_sim_user
N = self.n_rec_movie
rank = dict()
watched_movies = self.trainset[user]
for similar_user, similarity_factor in sorted(self.user_sim_mat[user].items(),
key=itemgetter(1), reverse=True)[0:K]:
for movie in self.trainset[similar_user]:
if movie in watched_movies:
continue
# predict the user's "interest" for each movie
rank.setdefault(movie, 0)
rank[movie] += similarity_factor
# return the N best movies
return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]
def evaluate(self):
''' print evaluation result: precision, recall, coverage and popularity '''
print ('Evaluation start...', file=sys.stderr)
N = self.n_rec_movie
# varables for precision and recall
hit = 0
rec_count = 0
test_count = 0
# varables for coverage
all_rec_movies = set()
# varables for popularity
popular_sum = 0
for i, user in enumerate(self.trainset):
if i % 500 == 0:
print ('recommended for %d users' % i, file=sys.stderr)
test_movies = self.testset.get(user, {})
rec_movies = self.recommend(user)
for movie, _ in rec_movies:
if movie in test_movies:
hit += 1
all_rec_movies.add(movie)
popular_sum += math.log(1 + self.movie_popular[movie])
rec_count += N
test_count += len(test_movies)
precision = hit / (1.0 * rec_count)
recall = hit / (1.0 * test_count)
coverage = len(all_rec_movies) / (1.0 * self.movie_count)
popularity = popular_sum / (1.0 * rec_count)
print ('precision=%.4f\trecall=%.4f\tcoverage=%.4f\tpopularity=%.4f' %
(precision, recall, coverage, popularity), file=sys.stderr)
if __name__ == '__main__':
ratingfile = os.path.join('ml-1m', 'ratings.dat')
usercf = UserBasedCF()
usercf.generate_dataset(ratingfile)
usercf.calc_user_sim()
usercf.evaluate()
运行结果
Similar movie number = 20
Recommended movie number = 10
loading ml-1m\ratings.dat(0)
loading ml-1m\ratings.dat(100000)
loading ml-1m\ratings.dat(200000)
loading ml-1m\ratings.dat(300000)
loading ml-1m\ratings.dat(400000)
loading ml-1m\ratings.dat(500000)
loading ml-1m\ratings.dat(600000)
loading ml-1m\ratings.dat(700000)
loading ml-1m\ratings.dat(800000)
loading ml-1m\ratings.dat(900000)
loading ml-1m\ratings.dat(1000000)
load ml-1m\ratings.dat succ split training set and test set succ train set = 700450 test set = 299759 counting movies number and popularity... count movies number and popularity succ total movie number = 3666 building co-rated users matrix... build co-rated users matrix succ calculating movie similarity matrix... calculating movie similarity factor(2000000) calculating movie similarity factor(4000000) calculating movie similarity factor(6000000) calculating movie similarity factor(8000000) calculating movie similarity factor(10000000) calculate movie similarity matrix(similarity factor) succ Total similarity factor number = 10174814 Evaluation start... recommended for 0 users recommended for 500 users recommended for 1000 users recommended for 1500 users recommended for 2000 users recommended for 2500 users recommended for 3000 users recommended for 3500 users recommended for 4000 users recommended for 4500 users recommended for 5000 users recommended for 5500 users recommended for 6000 users precision=0.3792 recall=0.0764 coverage=0.1729 popularity=7.1730 Process finished with exit code 0