itemCF推荐算法的实现

基于ItemCF的协同过滤算法

实训需要实现一个推荐系统,所以先实现了一个比较简单的协同过滤算法

协同过滤的思想来源是充分利用集体智慧,即在大量的人群的行为和数据集中收集答案,以帮助我们对整个人群得到统计意义上的结论。
协同过滤算法有两个基本出发点:

  1. 兴趣相近的用户可能会对同样的东西感兴趣;
  2. 用户可能较偏爱与其已购买的东西相类似的商品。

基于1的就是UserCF
基于2的就是ItemCF

这两个算法各有千秋,但是它们都需要离线计算,一般以天为单位更新
众所周知,用户可能产生新的行为,浏览新的物品,
这时候虽然ItemCF的相似度矩阵没有更新,但是仍然可以用它为用户推荐,
所以虽然ItemCF需要离线计算,但是它也具有一定的动态性
(一般来说,用户相似度变化要比物品相似度变化快)

ItemCF的步骤可以大致分为两个部分

  1. 将物品的用户当作物品的特征向量,然后计算物品之间的相似度,得到物品相似度矩阵
  2. 从用户已经产生行为的物品中找到与其相似的K个物品(从相似度矩阵中),进行加权累加(用户对已产生行为物品的评分*相似物品的相似度),找出评分最高的N件物品推荐给用户

具体实现可以参考以下代码:

import random
import math
from operator import itemgetter
class ItemBasedCF:
    """Item-based collaborative filtering (ItemCF) recommender for cookbooks.

    Typical usage is ``get_dataset`` -> ``calc_cookbook_sim`` -> ``recommend``
    / ``evaluate``.

    Attributes are plain dicts keyed by the raw strings read from the rating
    file:

    * ``trainSet`` / ``testSet``: ``user -> {cookbook -> rating}`` (ratings are
      kept as the strings read from the file and converted with ``float`` when
      scoring).
    * ``cookbook_sim_matrix``: ``cookbook -> {other cookbook -> similarity}``.
    * ``cookbook_popular``: ``cookbook -> number of training users`` who rated
      it.
    * ``cookbook_count``: number of distinct cookbooks in the training set.
    """

    def __init__(self):
        """Set the default K/N parameters and create empty data containers."""
        # K: how many most-similar cookbooks are considered for each cookbook
        # the user has already rated.
        self.n_sim_cookbook = 20
        # N: how many cookbooks are finally recommended to the user.
        self.n_rec_cookbook = 10

        # Train / test split of the rating data.
        self.trainSet = {}
        self.testSet = {}

        # Item-item (cookbook-cookbook) similarity matrix.
        self.cookbook_sim_matrix = {}
        # Number of training users per cookbook.
        self.cookbook_popular = {}
        self.cookbook_count = 0

    def get_dataset(self, filename, pivot=0.75):
        """Load ratings from ``filename`` and randomly split them.

        Every data line must consist of exactly four comma-separated fields:
        ``user,cookbook,rating,timestamp``. Each rating goes to the training
        set with probability ``pivot`` and to the test set otherwise.

        Args:
            filename: Path of the rating file (its first line is skipped as a
                header by ``load_file``).
            pivot: Probability that a rating is assigned to the training set.

        Raises:
            ValueError: If a non-blank line does not have exactly four fields.
        """
        trainset_len = 0
        testset_len = 0
        for line in self.load_file(filename):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpacking below.
            if not line.strip():
                continue
            user, cookbook, rating, _timestamp = line.split(',')
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[cookbook] = rating
                trainset_len += 1
            else:
                self.testSet.setdefault(user, {})[cookbook] = rating
                testset_len += 1
        print('Split success!')
        print('TrainSet = %s' % trainset_len)
        print('TestSet = %s' % testset_len)

    def load_file(self, filename):
        """Yield every line of ``filename`` except the first (header) line.

        Trailing ``\\r`` / ``\\n`` characters are stripped from each line. A
        success message is printed once the file has been fully consumed.
        """
        with open(filename, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:  # skip the header line
                    continue
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    def calc_cookbook_sim(self):
        """Build the cookbook-cookbook similarity matrix from ``trainSet``.

        Steps:

        1. Count how many training users rated each cookbook.
        2. Accumulate co-occurrence weights for every ordered pair of
           cookbooks rated by the same user, where each user contributes
           ``1 / log2(1 + number_of_cookbooks_rated)`` (IUF weighting, so very
           active users count less).
        3. Turn the weights into cosine-style similarities by dividing by
           ``sqrt(popularity[m1] * popularity[m2])``.
        4. Normalise every row by its maximum similarity (Item-Norm).

        The popularity table and the similarity matrix are rebuilt from
        scratch on every call, so calling this method repeatedly yields the
        same result instead of accumulating onto previous values.
        """
        popularity = {}
        for cookbooks in self.trainSet.values():
            for cookbook in cookbooks:
                popularity[cookbook] = popularity.get(cookbook, 0) + 1
        self.cookbook_popular = popularity

        self.cookbook_count = len(popularity)
        print("Total cookbook number = %d" % self.cookbook_count)

        sim_matrix = {}
        for cookbooks in self.trainSet.values():
            # A user with fewer than two cookbooks contributes no pairs.
            if len(cookbooks) < 2:
                continue
            # User-activity weighting (item-IUF); constant per user, so it is
            # computed once rather than for every pair.
            weight = 1 / math.log2(1 + len(cookbooks))
            for m1 in cookbooks:
                row = sim_matrix.setdefault(m1, {})
                for m2 in cookbooks:
                    if m1 == m2:
                        continue
                    row[m2] = row.get(m2, 0) + weight
        self.cookbook_sim_matrix = sim_matrix
        print("Build co-rated users matrix success!")

        print("Calculating cookbook similarity matrix ...")
        for m1, related_cookbooks in sim_matrix.items():
            mx = 0  # largest similarity in this row
            for m2, count in related_cookbooks.items():
                # Cosine similarity; a zero norm (a cookbook without users)
                # yields similarity 0 instead of a division by zero.
                norm = math.sqrt(popularity[m1] * popularity[m2])
                sim = count / norm if norm else 0
                related_cookbooks[m2] = sim
                mx = max(sim, mx)
            # Item-Norm: scale the row so that its maximum becomes 1. Skipped
            # when the whole row is zero to avoid dividing by zero.
            if mx > 0:
                for m2 in related_cookbooks:
                    related_cookbooks[m2] /= mx
        self.cookbook_sim_matrix = sim_matrix
        print('Calculate cookbook similarity matrix success!')

    def recommend(self, user):
        """Return up to N recommended cookbooks for ``user``.

        For every cookbook the user rated in the training set, the K most
        similar cookbooks are looked up; each candidate the user has not rated
        yet accumulates ``similarity * rating``. Cookbooks that have no row in
        the similarity matrix (they never co-occurred with another cookbook)
        simply contribute no candidates.

        Args:
            user: User key as stored in ``trainSet``.

        Returns:
            A list of ``(cookbook, score)`` tuples sorted by descending score,
            at most ``n_rec_cookbook`` long.

        Raises:
            KeyError: If ``user`` is not present in ``trainSet``.
        """
        K = self.n_sim_cookbook
        N = self.n_rec_cookbook
        rank = {}
        watched_cookbooks = self.trainSet[user]

        for cookbook, rating in watched_cookbooks.items():
            neighbours = self.cookbook_sim_matrix.get(cookbook)
            if not neighbours:
                continue
            # The K cookbooks most similar to one the user already rated.
            top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[:K]
            for related_cookbook, w in top_k:
                # Never recommend something the user has already rated.
                if related_cookbook in watched_cookbooks:
                    continue
                # Score = accumulated (similarity * user's rating).
                rank[related_cookbook] = rank.get(related_cookbook, 0) + w * float(rating)
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def evaluate(self):
        """Print precision, recall, coverage and F-measure of the recommender.

        Recommendations are produced for every training user and compared with
        that user's test-set cookbooks. Precision uses ``N`` per user as its
        denominator regardless of how many recommendations were actually
        returned. Any metric whose denominator is zero is reported as 0.
        """
        print('Evaluating start ...')
        N = self.n_rec_cookbook
        # Counters for precision and recall.
        hit = 0
        rec_count = 0
        test_count = 0
        # Distinct recommended cookbooks, for coverage.
        all_rec_cookbooks = set()

        for user in self.trainSet:
            test_cookbooks = self.testSet.get(user, {})
            for cookbook, _score in self.recommend(user):
                if cookbook in test_cookbooks:
                    hit += 1
                all_rec_cookbooks.add(cookbook)
            rec_count += N
            test_count += len(test_cookbooks)

        precision = hit / rec_count if rec_count else 0.0
        recall = hit / test_count if test_count else 0.0
        coverage = (len(all_rec_cookbooks) / self.cookbook_count
                    if self.cookbook_count else 0.0)
        # F-measure, F = (1 + a^2) * P * R / (a^2 * P + R); a = 1 gives F1.
        alpha = 1
        denominator = alpha * alpha * precision + recall
        if denominator:
            fmeasure = ((alpha * alpha + 1) * precision * recall) / denominator
        else:
            fmeasure = 0.0
        print('precisioin=%.4f\trecall=%.4f\ncoverage=%.4f\tF-Measure=%.4f\n' % (precision, recall, coverage, fmeasure))


if __name__ == '__main__':
    # Script entry point: create the recommender, load and split the rating
    # file, then build the cookbook similarity matrix.
    recommender = ItemBasedCF()
    recommender.get_dataset('ml-latest-small/ratings.csv')
    recommender.calc_cookbook_sim()
    原文作者:Output20
    原文地址: https://blog.csdn.net/skyjudger/article/details/80184164
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞