基于ItemCF的协同过滤算法
实训需要实现一个推荐系统,所以先实现了一个比较简单的协同过滤算法
协同过滤的思想来源是充分利用集体智慧,即从大量人群的行为和数据中收集答案,以帮助我们对整个人群得到统计意义上的结论。
协同过滤算法有两个基本出发点:
- 兴趣相近的用户可能会对同样的东西感兴趣;
- 用户可能较偏爱与其已购买的东西相类似的商品。
基于第一个出发点的就是UserCF(基于用户的协同过滤)
基于第二个出发点的就是ItemCF(基于物品的协同过滤)
这两个算法各有千秋,但是它们都需要离线计算,一般以天为单位更新
众所周知,用户可能产生新的行为,浏览新的物品,
这时候虽然ItemCF的相似度矩阵没有更新,但是仍然可以使用它为用户推荐,
所以虽然ItemCF需要离线计算,但是它也具有一定的动态性
(一般来说,用户相似度变化要比物品相似度变化快)
ItemCF的步骤可以大致分为两个部分
- 将对物品产生过行为的用户当作该物品的特征向量,然后计算物品之间的相似度,得到物品相似度矩阵
- 对用户已经产生行为的每个物品,从相似度矩阵中找到与其最相似的K个物品,再对这些候选物品进行加权累加(用户对已产生行为物品的评分*该物品与候选物品的相似度),找出得分最高的N件物品推荐给用户
具体实现可以参考以下代码:
import random
import math
from operator import itemgetter
class ItemBasedCF:
    """Item-based collaborative filtering (ItemCF) recommender for cookbooks.

    Typical workflow: ``get_dataset`` loads and randomly splits the ratings
    file, ``calc_cookbook_sim`` builds the item-item similarity matrix,
    ``recommend`` produces a top-N list for one user, and ``evaluate`` prints
    precision / recall / coverage / F-measure measured on the test split.
    """

    def __init__(self):
        """Set the K / N hyper-parameters and create the empty containers."""
        # K: how many most-similar cookbooks are looked at for each cookbook
        # the user has already interacted with.
        self.n_sim_cookbook = 20
        # N: how many cookbooks are finally recommended to the user.
        self.n_rec_cookbook = 10

        # user -> {cookbook -> rating}. Ratings are stored exactly as read
        # from the file (strings) and converted to float when scoring.
        self.trainSet = {}
        self.testSet = {}

        # Item-item similarity matrix: cookbook -> {other cookbook -> similarity}.
        self.cookbook_sim_matrix = {}
        # cookbook -> number of training users that rated it.
        self.cookbook_popular = {}
        # Number of distinct cookbooks seen in the training set.
        self.cookbook_count = 0

    def get_dataset(self, filename, pivot=0.75):
        """Load "user,cookbook,rating,timestamp" rows and split them randomly.

        Every row is put into the training set with probability ``pivot`` and
        into the test set otherwise.

        Args:
            filename: Path of the comma-separated ratings file. Its first
                line is treated as a header and skipped.
            pivot: Probability that a row lands in the training set.

        Raises:
            ValueError: If a non-blank row does not consist of exactly four
                comma-separated fields.
        """
        trainset_len = 0
        testset_len = 0
        for line in self.load_file(filename):
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no rating; unpacking them would raise ValueError.
            if not line.strip():
                continue
            user, cookbook, rating, _timestamp = line.split(',')
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[cookbook] = rating
                trainset_len += 1
            else:
                self.testSet.setdefault(user, {})[cookbook] = rating
                testset_len += 1
        print('Split success!')
        print('TrainSet = %s' % trainset_len)
        print('TestSet = %s' % testset_len)

    def load_file(self, filename):
        """Yield every line of ``filename`` except the first (header) line.

        Trailing ``\\r`` / ``\\n`` characters are stripped from each line. The
        success message is printed once the generator has been exhausted.
        """
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:  # skip the header row
                    continue
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    def calc_cookbook_sim(self):
        """Build the normalised item-item similarity matrix from ``trainSet``.

        Steps:
            1. Count how many users rated each cookbook (popularity).
            2. Accumulate co-occurrence weights for every ordered pair of
               cookbooks rated by the same user. Each user contributes
               ``1 / log2(1 + number of cookbooks rated)`` per pair, so very
               active users count less (the IUF variant of ItemCF).
            3. Divide by ``sqrt(popularity[m1] * popularity[m2])`` (cosine
               style similarity).
            4. Divide every row by its maximum so the largest similarity in
               each row is 1 (ItemCF-Norm).

        Results are written to ``cookbook_popular``, ``cookbook_count`` and
        ``cookbook_sim_matrix``.
        """
        # Start from a clean slate: re-running on top of previous results
        # would double-count popularity and normalise already-normalised
        # similarities.
        self.cookbook_popular.clear()
        self.cookbook_sim_matrix.clear()

        for cookbooks in self.trainSet.values():
            for cookbook in cookbooks:
                self.cookbook_popular[cookbook] = self.cookbook_popular.get(cookbook, 0) + 1
        self.cookbook_count = len(self.cookbook_popular)
        print("Total cookbook number = %d" % self.cookbook_count)

        for cookbooks in self.trainSet.values():
            # A user with fewer than two cookbooks produces no pairs (and
            # log2(1 + 0) would be a zero divisor).
            if len(cookbooks) < 2:
                continue
            # Per-user weight is constant for all of that user's pairs.
            # A plain co-occurrence count would use weight = 1 instead.
            weight = 1 / math.log2(1 + len(cookbooks))
            for m1 in cookbooks:
                row = self.cookbook_sim_matrix.setdefault(m1, {})
                for m2 in cookbooks:
                    if m1 == m2:
                        continue
                    row[m2] = row.get(m2, 0) + weight
        print("Build co-rated users matrix success!")

        print("Calculating cookbook similarity matrix ...")
        for m1, related_cookbooks in self.cookbook_sim_matrix.items():
            mx = 0  # largest similarity in this row
            # Only values of existing keys are reassigned, so iterating the
            # dict while updating it is safe.
            for m2, count in related_cookbooks.items():
                if self.cookbook_popular[m1] == 0 or self.cookbook_popular[m2] == 0:
                    # Guard against a zero vector (cookbook with no users).
                    related_cookbooks[m2] = 0
                else:
                    related_cookbooks[m2] = count / math.sqrt(
                        self.cookbook_popular[m1] * self.cookbook_popular[m2])
                mx = max(related_cookbooks[m2], mx)
            # Row normalisation; skipped for an all-zero row to avoid 0/0.
            if mx > 0:
                for m2 in related_cookbooks:
                    related_cookbooks[m2] /= mx
        print('Calculate cookbook similarity matrix success!')

    def recommend(self, user):
        """Return up to N ``(cookbook, score)`` pairs for ``user``, best first.

        For every cookbook the user rated in the training set, the K most
        similar cookbooks are looked up; each candidate the user has not
        rated yet accumulates ``similarity * rating``.

        Args:
            user: User id as stored in ``trainSet``.

        Returns:
            A list of at most ``n_rec_cookbook`` tuples sorted by descending
            score. The list is empty for users unknown to the training set or
            whose cookbooks have no similar items.
        """
        K = self.n_sim_cookbook
        N = self.n_rec_cookbook
        rank = {}
        # Unknown (cold-start) users simply get no recommendations.
        watched_cookbooks = self.trainSet.get(user, {})
        for cookbook, rating in watched_cookbooks.items():
            # A cookbook that never co-occurred with another one has no row
            # in the similarity matrix.
            neighbours = self.cookbook_sim_matrix.get(cookbook, {})
            score = float(rating)
            top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]
            for related_cookbook, w in top_k:
                # Never recommend something the user already interacted with.
                if related_cookbook in watched_cookbooks:
                    continue
                rank[related_cookbook] = rank.get(related_cookbook, 0) + w * score
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Recommend for every training user and print the quality metrics.

        Prints precision, recall, coverage and F-measure. A metric whose
        denominator is zero (e.g. empty test set) is reported as 0 instead
        of raising ``ZeroDivisionError``.
        """
        print('Evaluating start ...')
        N = self.n_rec_cookbook
        # Counters for precision and recall.
        hit = 0
        rec_count = 0
        test_count = 0
        # Distinct recommended cookbooks, used for coverage.
        all_rec_cookbooks = set()

        for user in self.trainSet:
            test_cookbooks = self.testSet.get(user, {})
            for cookbook, _ in self.recommend(user):
                if cookbook in test_cookbooks:
                    hit += 1
                all_rec_cookbooks.add(cookbook)
            # Precision is measured against the full list length N, even if
            # fewer items could be recommended.
            rec_count += N
            test_count += len(test_cookbooks)

        precision = hit / rec_count if rec_count else 0.0
        recall = hit / test_count if test_count else 0.0
        coverage = (len(all_rec_cookbooks) / self.cookbook_count
                    if self.cookbook_count else 0.0)

        # F-beta score: (1 + a^2) * P * R / (a^2 * P + R); a = 1 gives F1.
        alpha = 1
        denominator = alpha * alpha * precision + recall
        fmeasure = (((alpha * alpha + 1) * precision * recall) / denominator
                    if denominator else 0.0)
        print('precisioin=%.4f\trecall=%.4f\ncoverage=%.4f\tF-Measure=%.4f\n' % (precision, recall, coverage, fmeasure))
if __name__ == '__main__':
    # Ratings file: a header line followed by "user,item,rating,timestamp" rows.
    rating_file = 'ml-latest-small/ratings.csv'

    # Create the recommender with its default K / N settings.
    itemCF = ItemBasedCF()

    # Read the ratings and split them randomly into training and test sets.
    itemCF.get_dataset(rating_file)

    # Build the cookbook-to-cookbook similarity matrix from the training set.
    itemCF.calc_cookbook_sim()