Personally, I don't think Teacher Cui's course was very good, and there were many mistakes in class, but along the way I still picked up some useful ideas and skills (debugging, for one).
Implementing basic rule-based algorithms
Personalized recommendation: demographic attributes, geographic attributes, asset attributes, interest attributes
Collaborative filtering: neighborhood-based CF (user-based UCF, item-based ICF) and model-based CF
Content-based recommendation
Hybrid recommendation
Popularity-based recommendation
Collaborative filtering
Pros: quick to implement, no special requirements on items or users, dependable results (the 80/20 rule)
Cons: cold start (can be mitigated with content-based information), the Matthew effect (popular items become ever more popular, unpopular items ever less so), and vague recommendation explanations
Neighborhood-based collaborative filtering
1. Distance metrics: Euclidean, Manhattan, Chebyshev, Hamming, Minkowski, Mahalanobis
Both the distances and the similarities are computed from the users' ratings.
2. Similarity measures: cosine similarity, Pearson correlation coefficient (a value between -1 and 1), Jaccard coefficient, log-likelihood similarity, mutual information, information gain, relative entropy (KL divergence), TF-IDF (a small sketch of the first two follows)
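As a quick illustration of the first two measures, here is a minimal sketch (my own addition, not course code) that computes the cosine similarity and the Pearson correlation of two made-up rating vectors.

from scipy.stats import pearsonr
import numpy as np

# Hypothetical rating vectors of two users over the same five items
u = np.array([5, 3, 0, 4, 4], dtype=float)
v = np.array([4, 0, 0, 5, 4], dtype=float)

# Cosine similarity: dot product divided by the product of the norms
cosine = u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Pearson correlation: the cosine of the mean-centered vectors, always in [-1, 1]
pearson, _ = pearsonr(u, v)

print("cosine:", round(cosine, 4))
print("pearson:", round(pearson, 4))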
Model-based collaborative filtering
1. Association rule algorithms: Apriori, FP-Growth (an improvement over Apriori); typical uses: supermarket baskets, online shopping
Resources for learning FP-Growth:
https://blog.csdn.net/daizongxue/article/details/77504158
https://blog.csdn.net/Bone_ACE/article/details/46669699
https://github.com/SongDark/FPgrowth
2. Clustering algorithms: K-Means, BIRCH
3. Classification algorithms: logistic regression, naive Bayes
4. Regression algorithms
5. Matrix factorization (see the sketch after this list)
6. Neural networks
7. Graph models
8. Latent factor models
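Items 4 to 8 were only named in class. As one concrete illustration of item 5, here is a minimal matrix-factorization sketch of my own (not course material): a rank-2 truncated SVD of a toy rating matrix, with unrated entries stored as 0, used to produce predicted scores.

import numpy as np

# Toy user-item rating matrix; 0 means "not rated" (hypothetical data)
R = np.array([[5, 3, 0, 1],
              [4, 0, 0, 1],
              [1, 1, 0, 5],
              [0, 1, 5, 4]], dtype=float)

# Rank-2 truncated SVD as a simple latent-factor model
U, s, Vt = np.linalg.svd(R, full_matrices=False)
k = 2
R_hat = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

# Predicted score of user 0 for the item in column 2, which user 0 has not rated
print(round(R_hat[0, 2], 3))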
Basic rule-based recommendation
Four parts: K-Means, MiniBatchKMeans, Birch, and similarity computation
KMeans(n_clusters=8, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
The MovieLens (ml-100k) dataset
943 users gave 100,000 ratings (1-5) to 1,682 movies; every user has rated at least 20 movies
from sklearn.cluster import KMeans
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],[4, 2], [4, 4], [4, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
print("标签:" + str(kmeans.labels_))
print("预测标签:" + str(kmeans.predict([[0, 0], [4, 4]])))
print("聚类中心:" + str(kmeans.cluster_centers_))
from sklearn.cluster import KMeans
u1_base = open("./ml-100k/u1.base")
X_array = []
y_array = []
# user id | item id | rating | timestamp
for l in u1_base:
    ls = l.strip().split("\t")
    y_array.append(int(ls[2]))
    X_array.append([int(ls[0]), int(ls[1])])
kmeans = KMeans(n_clusters=5, random_state=0).fit(X_array)
kmeans.predict([[1,6]])
Directions for tuning
1. Re-analyze the data
2. Choose a different algorithm
3. Re-interpret the data
Evaluation
adjusted_rand_score: the Adjusted Rand Index (ARI)
1. For any number of clusters and samples, the ARI of a random clustering is very close to 0 (illustrated below);
2. Its value lies in [-1, 1]; negative values indicate a poor result, and the closer to 1 the better;
3. It can be used to compare clustering algorithms.
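A quick sanity check of these properties (my own example, not from the course): the ARI is invariant to relabeling the clusters and is near zero for random label assignments.

from sklearn.metrics import adjusted_rand_score
import numpy as np

# The same partition with permuted label names still scores 1.0
y_true = [0, 0, 1, 1, 2, 2]
print(adjusted_rand_score(y_true, [1, 1, 2, 2, 0, 0]))

# Two independent random labelings over a larger sample score close to 0
rng = np.random.default_rng(0)
print(adjusted_rand_score(rng.integers(0, 3, size=1000), rng.integers(0, 3, size=1000)))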
u1_test = open("./ml-100k/u1.test")
X_test = []
y_test = []
for l in u1_test:
    ls = l.strip().split("\t")
    X_test.append([int(ls[0]), int(ls[1])])
    y_test.append(int(ls[2]))
y_pre = kmeans.predict(X_test)
y_pred = []
for y in y_pre:
    y += 1
    y_pred.append(y)
from sklearn import metrics
print(metrics.adjusted_rand_score(y_test, y_pred))
from sklearn.cluster import MiniBatchKMeans
mb = MiniBatchKMeans(n_clusters=5, batch_size=200).fit(X_array)
y_mini_pre = mb.predict(X_test)
y_mini_pred = []
for y in y_mini_pre:
    y += 1
    y_mini_pred.append(y)
from sklearn import metrics
print(metrics.adjusted_rand_score(y_test, y_mini_pred))
from sklearn.cluster import Birch
bch = Birch(threshold=0.5, branching_factor=50, n_clusters=5, compute_labels=True)
bch.fit(X_array[0:1000])
y_bch_pre = bch.predict(X_test[0:1000])
y_bch_pred = []
for y in y_bch_pre:
    y += 1
    y_bch_pred.append(y)
from sklearn import metrics
print(metrics.adjusted_rand_score(y_test[0:1000], y_bch_pred))
Collaborative filtering
1. Item-based collaborative filtering (ICF)
Recommend items that are similar to the items the user liked in the past.
Built from a user-to-item inverted table.
2. User-based collaborative filtering (UCF)
Recommend items between users with similar tastes.
Built from an item-to-user inverted table (a minimal sketch of both tables follows).
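A minimal sketch of the two inverted tables (my own toy triples; the real classes below build the same structures from the ml-100k file):

from collections import defaultdict

# Hypothetical (user, item, rating) triples
ratings = [("u1", "i1", 5), ("u1", "i2", 3), ("u2", "i1", 4), ("u3", "i2", 2)]

user_items = defaultdict(dict)  # user -> {item: rating}, the table ICF iterates over
item_users = defaultdict(set)   # item -> {users}, the table UCF iterates over
for user, item, score in ratings:
    user_items[user][item] = score
    item_users[item].add(user)

print(dict(user_items))
print({item: sorted(users) for item, users in item_users.items()})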
import math

class ItemBasedCF:
    def __init__(self, train_file):
        """Initialize the object."""
        self.train_file = train_file
        self.readData()

    def readData(self):
        """Read the training file and build the user-item rating table."""
        self.train = {}
        # Open the file and read the training set
        for line in open(self.train_file):
            user, item, score, _ = line.strip().split("\t")
            self.train.setdefault(user, {})
            self.train[user][item] = int(score)

    def ItemSimilarity(self):
        """Compute item-item similarities."""
        C = {}  # C[i][j]: number of users who rated both i and j
        N = {}  # N[i]: number of users who rated item i
        for user, items in self.train.items():
            for i in items.keys():
                N.setdefault(i, 0)
                N[i] += 1
                C.setdefault(i, {})
                for j in items.keys():
                    if i == j:
                        continue
                    C[i].setdefault(j, 0)
                    C[i][j] += 1
        self.W = {}  # W[i][j]: cosine-style similarity between items i and j
        for i, related_items in C.items():
            self.W.setdefault(i, {})
            for j, cij in related_items.items():
                self.W[i][j] = cij / math.sqrt(N[i] * N[j])
        return self.W

    def Recommend(self, user, K=3, N=10):
        """For each item the user rated, take its K most similar items and return the top N."""
        rank = {}  # predicted preference for each candidate item
        action_item = self.train[user]  # items the user has already rated, with scores
        for item, score in action_item.items():
            for j, wj in sorted(self.W[item].items(), key=lambda x: x[1], reverse=True)[0:K]:
                if j in action_item:
                    continue
                rank.setdefault(j, 0)
                rank[j] += score * wj
        return sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N]

# Recommend 10 unseen movies for user 3
if __name__ == "__main__":
    cf = ItemBasedCF("./ml-100k/u.data")
    cf.ItemSimilarity()
    print(cf.Recommend("3"))
Preference collection
Explicit: ratings, votes, reposts, bookmarks, tags, comments
Implicit: clicks, page dwell time, purchases
Evaluation metrics: recall, precision, coverage, novelty (see the sketch below)
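A minimal sketch of the first two metrics (my own addition): precision and recall of a single user's top-N list against a hypothetical set of relevant items.

def precision_recall(recommended, relevant):
    """Precision and recall of one user's top-N recommendation list."""
    hits = len(set(recommended) & set(relevant))
    precision = hits / len(recommended) if recommended else 0.0
    recall = hits / len(relevant) if relevant else 0.0
    return precision, recall

# Hypothetical top-5 list and the items the user actually liked in the test set
print(precision_recall(["i1", "i2", "i3", "i4", "i5"], ["i2", "i5", "i9"]))  # (0.4, 0.666...)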
import math

class UserBasedCF:
    def __init__(self, train_file):
        """Initialize the object."""
        self.train_file = train_file
        self.readData()
        self.UserSimilarity()

    def readData(self):
        """Read the training file and build the user-item rating table."""
        self.train = {}
        # Open the file and read the training set
        for line in open(self.train_file):
            user, item, score, _ = line.strip().split("\t")
            self.train.setdefault(user, {})
            self.train[user][item] = int(score)

    def UserSimilarity(self):
        """Compute user-user similarities via an item -> users inverted table."""
        self.item_users = {}
        for user, items in self.train.items():
            for i in items.keys():
                self.item_users.setdefault(i, set())
                self.item_users[i].add(user)
        C = {}  # C[u][v]: number of items rated by both u and v
        N = {}  # N[u]: number of items rated by user u
        for i, users in self.item_users.items():
            for u in users:
                N.setdefault(u, 0)
                N[u] += 1
                C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    C[u].setdefault(v, 0)
                    C[u][v] += 1
        self.W = {}  # W[u][v]: cosine-style similarity between users u and v
        for u, related_users in C.items():
            self.W.setdefault(u, {})
            for v, cuv in related_users.items():
                self.W[u][v] = cuv / math.sqrt(N[u] * N[v])
        return self.W

    def Recommend(self, user, K=3, N=10):
        """Recommend items from the K most similar users and return the top N."""
        rank = {}
        action_item = self.train[user].keys()
        for v, wuv in sorted(self.W[user].items(), key=lambda x: x[1], reverse=True)[0:K]:
            for i, rvi in self.train[v].items():
                if i in action_item:
                    continue
                rank.setdefault(i, 0)
                rank[i] += rvi * wuv
        return sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N]

if __name__ == "__main__":
    cf = UserBasedCF("./ml-100k/u.data")
    print(cf.Recommend("3"))
Computing various distances: scipy.spatial.distance.cdist (a small usage sketch follows the link below)
https://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.spatial.distance.cdist.html
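A minimal usage sketch (assuming SciPy is available): cdist takes two 2-D arrays and returns the matrix of pairwise distances between their rows; the third argument selects the metric.

import numpy as np
from scipy.spatial import distance

A = np.array([[0.0, 0.0], [1.0, 1.0]])
B = np.array([[3.0, 4.0]])

# One row per point of A, one column per point of B
print(distance.cdist(A, B, 'euclidean'))   # [[5.0], [3.6055...]]
print(distance.cdist(A, B, 'cityblock'))   # [[7.0], [5.0]]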
# Note: the instructor's version below is incorrect (see the comments in UserSimilarity)
import math
from scipy.spatial import distance
import numpy as np

class UserBasedCF:
    def __init__(self, train_file):
        """Initialize the object."""
        self.train_file = train_file
        self.readData()
        self.UserSimilarity()

    def readData(self):
        """Read the training file and build the user-item rating table."""
        self.train = {}
        # Open the file and read the training set
        for line in open(self.train_file):
            user, item, score, _ = line.strip().split("\t")
            self.train.setdefault(user, {})
            self.train[user][item] = int(score)

    def UserSimilarity(self):
        self.item_users = {}
        for user, items in self.train.items():
            for i in items.keys():
                self.item_users.setdefault(i, set())
                self.item_users[i].add(user)
        C = {}    # C[u][v]: number of items rated by both u and v
        N = {}    # N[u]: number of items rated by user u
        Cor = {}  # per the lecture: for each user, the other users who co-rated some item
        for i, users in self.item_users.items():
            for u in users:
                N.setdefault(u, 0)
                N[u] += 1
                C.setdefault(u, {})
                for v in users:
                    if u == v:
                        continue
                    C[u].setdefault(v, 0)
                    C[u][v] += 1
                Cor.setdefault(u, [])
                for v in users:
                    if u == v:
                        continue
                    Cor[u].append(v)
                print(Cor[u])
        self.W = {}
        self.Euc = {}
        for u, related_users in C.items():
            self.W.setdefault(u, {})
            self.Euc.setdefault(u, {})
            for v, cuv in related_users.items():
                self.W[u][v] = cuv / math.sqrt(N[u] * N[v])
                if u in Cor.keys() and v in Cor.keys():
                    # This is where the lecture goes wrong: Cor holds user-id strings, cdist
                    # needs numeric vectors of equal length, and a Euclidean distance between
                    # lists of neighbour ids is not a meaningful user-user distance anyway.
                    self.Euc[u][v] = np.sum(distance.cdist([Cor[u][0:10]], [Cor[v][0:10]], 'euclidean'))
                    print(self.Euc[u][v])
        return self.W

    def Recommend(self, user, K=3, N=10):
        rank = {}
        action_item = self.train[user].keys()
        for v, wuv in sorted(self.W[user].items(), key=lambda x: x[1], reverse=True)[0:K]:
            for i, rvi in self.train[v].items():
                if i in action_item:
                    continue
                rank.setdefault(i, 0)
                rank[i] += rvi * wuv
        return sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N]

if __name__ == "__main__":
    cf = UserBasedCF("./ml-100k/u.data")
    print(cf.Recommend("3"))
from scipy.spatial import distance
import numpy as np

a = np.array([[1, 2, 3, 4, 5, 6]])
b = np.array([[6, 5, 4, 3, 2, 1]])
print(distance.cdist(a, b, 'euclidean'))
# cdist needs numeric input: string ids such as [["1", "2"]] must be converted first
c = np.array([["1", "2"]], dtype=float)
print(distance.cdist(c, c, 'euclidean'))
Implementing association rules
1 Apriori
The larger the support and confidence, the more often the items are bought together and the more trustworthy the rule.
Purpose: find items that repeatedly appear together in purchase records, so marketers can plan better campaigns and customers can shop more conveniently.
Strategies: place items that are bought together side by side, or at the two ends of the store.
support(A -> B) = P(AB)
confidence(A -> B) = P(B|A) = P(AB)/P(A)
Itemset: a set of items, i.e. a combination of products
k-itemset: a combination of k products
Frequent itemset: an itemset whose relative support meets the given minimum support
Strong association rule: a rule that satisfies both the given support and confidence thresholds (a small worked example follows)
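To make the two formulas concrete, here is a tiny worked example over made-up baskets (my own numbers, not from the course), computing support and confidence for the rule {milk} -> {bread}.

transactions = [
    {"milk", "bread"},
    {"milk", "bread", "butter"},
    {"milk"},
    {"bread", "butter"},
    {"milk", "bread"},
]

n = len(transactions)
count_milk = sum(1 for t in transactions if "milk" in t)                    # 4
count_milk_bread = sum(1 for t in transactions if {"milk", "bread"} <= t)   # 3

support = count_milk_bread / n              # P(milk and bread) = 3/5 = 0.6
confidence = count_milk_bread / count_milk  # P(bread | milk)   = 3/4 = 0.75
print(support, confidence)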
Framing the problem:
1. Find the items that always appear together
2. Define the measures: support and confidence thresholds
3. Compute support and confidence
4. Find the frequent itemsets
5. Derive strong association rules from the frequent itemsets
Finding association rules --> finding frequent itemsets
1. Find all frequent itemsets: an itemset must occur at least as often as the required minimum support count
2. Generate strong association rules from the frequent itemsets: rules that satisfy the minimum support and minimum confidence
Challenges:
1. Repeated database scans
2. A huge number of candidate itemsets
3. Tedious support counting
Improvements:
1. Reduce the number of database scans
2. Reduce the number of candidate itemsets (e.g. by raising the minimum support)
3. Simplify support counting for the candidates
# Course code (incomplete)
#coding:utf-8
from numpy import *
# 生成原始数据集用于测试
def LoadDataSet():
return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
#遍历数据集每项物品,建立1-项集
def createC1(dataSet):
# 记录每项物品的列表
C1 = []
# 遍历每一条记录
for transaction in dataSet:
for item in transaction:
if [item] not in C1:
C1.append([item])
C1.sort()
print("C1 : " + str(C1))
# Map every entry to frozenset (immutable), so the itemsets can later be used as dict keys
# list() is needed in Python 3, where map() returns a one-shot iterator
return list(map(frozenset, C1))
# 输入:数据集D, 候选集CK, 最小支持度
# 候选集Ck由上一层的频繁项集Lk-1组合得到
# 用最小支持度minSupport对候选集Ck过滤
# 输出:第k层的频繁项集Lk,每项的支持度
def scanD(D, Ck, minSupport):
# 建立字典,key为候选集Ck中的每项, value为该物品在所有物品记录中出现的次数
ssCnt = {}
# 对比候选集中的每项与原物品记录,统计出现的次数
# 遍历每条物品记录
for tid in D:
# 遍历候选集中的每一项,便于对比
for can in Ck:
# 如果候选集Ck中,该项在该条物品记录出现
# 即当前项是当前物品记录的子集
if can.issubset(tid):
# If this candidate is counted for the first time, set its count to 1
if can not in ssCnt: ssCnt[can] = 1
# Otherwise add 1 to the existing count (dict.has_key was removed in Python 3)
else: ssCnt[can] += 1
# 数据集中总的记录数,物品购买记录总数,用于计算支持度
numItems = float(len(D))
# 记录经最小支持度过滤后的频繁项集
retList = []
# key ---------> 候选集中满足条件的项
# value -------> 该项支持度
supportData = {}
for key in ssCnt:
# 计算每项的支持度
support = ssCnt[key] / numItems
# 最小支持度过滤
if support >= minSupport:
# 保留满足条件的物品集合
# 使用retList.insert(0, key)在列表首部插入新的集合
# 只是为了让列表看起来有组织
retList.insert(0, key)
# 记录该项的支持度
# 注意:候选集中所有项的支持度,也保留先来
# 不仅仅是满足最小支持度的项,其他项也被保留
supportData[key] = support
return retList, supportData
# 由上层频繁k-1项集生成候选k项集
# 如果输入为{0}, {1}, {2}, 会生成 {0,1}, {0,2}, {1,2}
# 输入:频繁k-1项集,新的候选集元素个数k
# 输出:候选集
def aprioriGen(Lk, k):
# 保存新的候选集
retList = []
# 输入的频繁项集记录数,用于循环遍历
lenLk = len(Lk)
# 比较频繁项集中的每项与其他项,若两项的前k-1个元素相同,那么就将两项合并
for i in range(lenLk):
# 遍历候选集中除前项后的其他项与当前项比较
for j in range(i+1, lenLk):
L1 = list(Lk[i])[:k-2]
# 候选集其余项的k-1个元素,每次只有其余项中的一项
L2 = list(Lk[j])[:k-2]
L1.sort()
L2.sort()
# 相同,两项合并
if L1 == L2:
retList.append(Lk[i] | Lk[j])
return retList
# 输入:数据集、最小支持度
def apriori(dataSet, minSupport=0.5):
print("function apriori ------------------- function apriori")
# 生成1项集
C1 = createC1(dataSet)
# Map each transaction to a set; wrap in list() so D can be scanned repeatedly in Python 3
D = list(map(set, dataSet))
# 过滤最小支持度,得到频繁一项集L1以及每项的支持度
L1, supportData = scanD(D, C1, minSupport)
print("each element of L1")
for ll in L1:
print(ll)
# 将L1放入到列表L中, L=[L1, L2, L3]
# L存放所有的频繁项集
# 由L1产生L2, L2产生L3
L = [L1]
# python下标0表示第一个元素,k=2表示从1项集产生2项候选集
# L0为频繁1项集
k = 2
# 根据L1寻找L2、L3,通过while来完成
# 他创建包含更大项集的更大列表,直到下一个更大的项集为空
# 候选项集物品组合长度不超过元数据集原数据集最大的物品记录长度
# 如果原始数据集物品记录最大长度为4,候选集最多为4项集
while (len(L[k-2]) > 0):
# 由频繁k-1项集产生频繁k项集
# (连接步)
print("k : " + str(k))
print("L[k-2] : " + str(L[k-2]))
print("L : " + str(L))
Ck = aprioriGen(L[k-2], k)
print("Ck : " + str(Ck))
# 由k项候选集经最小支持度筛选生成频繁k项集
# (剪枝步)
Lk, supK = scanD(D, Ck, minSupport)
# 更新支持度字典,用于加入新的支持度
print("support K")
print(supK)
# dict1.update(dict2) 函数把字典dict2的键/值对更新到dict里
supportData.update(supK)
# 将新的频繁k项集加入已有的频繁项集的列表中
L.append(Lk)
k += 1
return L, supportData
if __name__ == "__main__":
dataSet = LoadDataSet()
L, supportData = apriori(dataSet, minSupport=0.5)
print("LLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLl")
print(L)
rules = generateRules(L, supportData, minconf=0.5)
print("rules rules rules rules rules rules rules rules rules rules rules rules")
print(rules)
# Test map()
from numpy import *
a = [[1,2,3,4,5,1,3,4],[123,312,3,41,4,1]]
b = map(set, a)
for i in b:
    print(i)
    print(type(i))
{1, 2, 3, 4, 5}
<class 'set'>
{1, 3, 4, 41, 312, 123}
<class 'set'>
# Test the comparison before and after sort()
a = [1,2]
b = [2,1]
print(a == b)
print(a.sort() == b.sort())
print(a == b)
False
True
True
def load_data_set():
""" Load a sample data set (From Data Mining: Concepts and Techniques, 3th Edition) Returns: A data set: A list of transactions. Each transaction contains several items. """
data_set = [['l1', 'l2', 'l5', 'l3'], ['l2', 'l4'], ['l2', 'l3'],
['l1', 'l2', 'l4'], ['l1', 'l3'], ['l2', 'l3'],
['l1', 'l3'], ['l1', 'l2', 'l3', 'l5'], ['l1', 'l2', 'l3']]
return data_set
def create_C1(data_set):
""" Create frequent candidate 1-itemset C1 by scaning data set. Args: data_set: A list of transactions. Each transaction contains several items. Returns: C1: A set which contains all frequent candidate 1-itemsets """
C1 = set()
for t in data_set:
for item in t:
item_set = frozenset([item])
C1.add(item_set)
return C1
def is_apriori(Ck_item, Lksub1):
""" Judge whether a frequent candidate k-itemset satisfy Apriori property. Args: Ck_item: a frequent candidate k-itemset in Ck which contains all frequent candidate k-itemsets. Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets. Returns: True: satisfying Apriori property. False: Not satisfying Apriori property. """
for item in Ck_item:
sub_Ck = Ck_item - frozenset([item])
if sub_Ck not in Lksub1:
return False
return True
def create_Ck(Lksub1, k):
""" Create Ck, a set which contains all all frequent candidate k-itemsets by Lk-1's own connection operation. Args: Lksub1: Lk-1, a set which contains all frequent candidate (k-1)-itemsets. k: the item number of a frequent itemset. Return: Ck: a set which contains all all frequent candidate k-itemsets. """
Ck = set()
len_Lksub1 = len(Lksub1)
list_Lksub1 = list(Lksub1)
for i in range(len_Lksub1):
for j in range(1, len_Lksub1):
l1 = list(list_Lksub1[i])
l2 = list(list_Lksub1[j])
l1.sort()
l2.sort()
if l1[0:k-2] == l2[0:k-2]:
Ck_item = list_Lksub1[i] | list_Lksub1[j]
# pruning
if is_apriori(Ck_item, Lksub1):
Ck.add(Ck_item)
return Ck
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
""" Generate Lk by executing a delete policy from Ck. Args: data_set: A list of transactions. Each transaction contains several items. Ck: A set which contains all all frequent candidate k-itemsets. min_support: The minimum support. support_data: A dictionary. The key is frequent itemset and the value is support. Returns: Lk: A set which contains all all frequent k-itemsets. """
Lk = set()
item_count = {}
for t in data_set:
for item in Ck:
if item.issubset(t):
if item not in item_count:
item_count[item] = 1
else:
item_count[item] += 1
t_num = float(len(data_set))
for item in item_count:
if (item_count[item] / t_num) >= min_support:
Lk.add(item)
support_data[item] = item_count[item] / t_num
return Lk
def generate_L(data_set, k, min_support):
""" Generate all frequent itemsets. Args: data_set: A list of transactions. Each transaction contains several items. k: Maximum number of items for all frequent itemsets. min_support: The minimum support. Returns: L: The list of Lk. support_data: A dictionary. The key is frequent itemset and the value is support. """
support_data = {}
C1 = create_C1(data_set)
L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
Lksub1 = L1.copy()
L = []
L.append(Lksub1)
for i in range(2, k+1):
Ci = create_Ck(Lksub1, i)
Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
Lksub1 = Li.copy()
L.append(Lksub1)
return L, support_data
def generate_big_rules(L, support_data, min_conf):
""" Generate big rules from frequent itemsets. Args: L: The list of Lk. support_data: A dictionary. The key is frequent itemset and the value is support. min_conf: Minimal confidence. Returns: big_rule_list: A list which contains all big rules. Each big rule is represented as a 3-tuple. """
big_rule_list = []
sub_set_list = []
for i in range(0, len(L)):
for freq_set in L[i]:
for sub_set in sub_set_list:
if sub_set.issubset(freq_set):
conf = support_data[freq_set] / support_data[freq_set - sub_set]
big_rule = (freq_set - sub_set, sub_set, conf)
if conf >= min_conf and big_rule not in big_rule_list:
# print freq_set-sub_set, " => ", sub_set, "conf: ", conf
big_rule_list.append(big_rule)
sub_set_list.append(freq_set)
return big_rule_list
if __name__ == "__main__":
""" Test """
data_set = load_data_set()
L, support_data = generate_L(data_set, k=3, min_support=0.2)
big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
for Lk in L:
print ("="*50)
print ("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")
print ("="*50)
for freq_set in Lk:
print (freq_set, support_data[freq_set])
print()
print ("Big Rules")
for item in big_rules_list:
print (item[0], "=>", item[1], "conf: ", item[2])
==================================================
frequent 1-itemsets support
==================================================
frozenset({'l1'}) 0.6666666666666666
frozenset({'l2'}) 0.7777777777777778
frozenset({'l5'}) 0.2222222222222222
frozenset({'l3'}) 0.7777777777777778
frozenset({'l4'}) 0.2222222222222222
==================================================
frequent 2-itemsets support
==================================================
frozenset({'l5', 'l1'}) 0.2222222222222222
frozenset({'l4', 'l2'}) 0.2222222222222222
frozenset({'l3', 'l2'}) 0.5555555555555556
frozenset({'l5', 'l2'}) 0.2222222222222222
frozenset({'l1', 'l2'}) 0.4444444444444444
frozenset({'l3', 'l1'}) 0.5555555555555556
frozenset({'l3', 'l5'}) 0.2222222222222222
==================================================
frequent 3-itemsets support
==================================================
frozenset({'l3', 'l5', 'l2'}) 0.2222222222222222
frozenset({'l3', 'l5', 'l1'}) 0.2222222222222222
frozenset({'l3', 'l1', 'l2'}) 0.3333333333333333
frozenset({'l5', 'l1', 'l2'}) 0.2222222222222222
Big Rules
frozenset({'l5'}) => frozenset({'l1'}) conf: 1.0
frozenset({'l4'}) => frozenset({'l2'}) conf: 1.0
frozenset({'l3'}) => frozenset({'l2'}) conf: 0.7142857142857143
frozenset({'l2'}) => frozenset({'l3'}) conf: 0.7142857142857143
frozenset({'l5'}) => frozenset({'l2'}) conf: 1.0
frozenset({'l3'}) => frozenset({'l1'}) conf: 0.7142857142857143
frozenset({'l1'}) => frozenset({'l3'}) conf: 0.8333333333333334
frozenset({'l5'}) => frozenset({'l3'}) conf: 1.0
frozenset({'l3', 'l5'}) => frozenset({'l2'}) conf: 1.0
frozenset({'l5', 'l2'}) => frozenset({'l3'}) conf: 1.0
frozenset({'l5'}) => frozenset({'l3', 'l2'}) conf: 1.0
frozenset({'l3', 'l5'}) => frozenset({'l1'}) conf: 1.0
frozenset({'l5', 'l1'}) => frozenset({'l3'}) conf: 1.0
frozenset({'l5'}) => frozenset({'l3', 'l1'}) conf: 1.0
frozenset({'l1', 'l2'}) => frozenset({'l3'}) conf: 0.75
frozenset({'l5', 'l2'}) => frozenset({'l1'}) conf: 1.0
frozenset({'l5', 'l1'}) => frozenset({'l2'}) conf: 1.0
frozenset({'l5'}) => frozenset({'l1', 'l2'}) conf: 1.0
2 FPGrowth
Apriori scans the database many times, whereas FP-Growth only needs two scans.
The first scan counts the support of each individual item, removes the items that do not meet the minimum support, and sorts the remaining items.
The second scan builds the FP-Tree (a minimal sketch of the first scan follows; the full implementation is below).
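A minimal sketch of the first scan only (my own illustration; the toy transactions are the earlier LoadDataSet baskets with 1-5 renamed A-E): count global item frequencies, drop infrequent items, and sort every transaction by descending frequency, which is the insertion order the second scan uses to build the FP-Tree.

from collections import Counter

transactions = [list('ACD'), list('BCE'), list('ABCE'), list('BE')]
min_support_count = 2

# First scan: global item counts, filtered by the minimum support count
counts = Counter(item for t in transactions for item in t)
frequent = {item: c for item, c in counts.items() if c >= min_support_count}

# Each transaction keeps only its frequent items, ordered by descending global count
ordered = [sorted((i for i in t if i in frequent), key=lambda i: (-frequent[i], i))
           for t in transactions]
print(frequent)  # {'A': 2, 'C': 3, 'B': 3, 'E': 3}
print(ordered)   # [['C', 'A'], ['B', 'C', 'E'], ['B', 'C', 'E', 'A'], ['B', 'E']]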
from collections import defaultdict, Counter, deque
import math
import copy
class node:
def __init__(self, item, count, parent): # 本程序将节点之间的链接信息存储到项头表中,后续可遍历项头表添加该属性
self.item = item # 该节点的项
self.count = count # 项的计数
self.parent = parent # 该节点父节点的id
self.children = [] # 该节点的子节点的list
class FP:
def __init__(self, minsup=0.5):
self.minsup = minsup
self.minsup_num = None # 支持度计数
self.N = None
self.item_head = defaultdict(list) # 项头表
self.fre_one_itemset = defaultdict(lambda: 0) # 频繁一项集,值为支持度
self.sort_rules = None # 项头表中的项排序规则,按照支持度从大到小有序排列
self.tree = defaultdict() # fp树, 键为节点的id, 值为node
self.max_node_id = 0 # 当前树中最大的node_id, 用于插入新节点时,新建node_id
self.fre_itemsets = [] # 频繁项集
self.fre_itemsets_sups = [] # 频繁项集的支持度计数
def init_param(self, data):
self.N = len(data)
self.minsup_num = math.ceil(self.minsup * self.N)
self.get_fre_one_itemset(data)
self.build_tree(data)
return
def get_fre_one_itemset(self, data):
# 获取频繁1项,并排序,第一次扫描数据集
c = Counter()
for t in data:
c += Counter(t)
for key, val in c.items():
if val >= self.minsup_num:
self.fre_one_itemset[key] = val
sort_keys = sorted(self.fre_one_itemset, key=self.fre_one_itemset.get, reverse=True)
self.sort_rules = {k: i for i, k in enumerate(sort_keys)} # 频繁一项按照支持度降低的顺序排列,构建排序规则
return
def insert_item(self, parent, item):
# 将事务中的项插入到FP树中,并返回插入节点的id
children = self.tree[parent].children
for child_id in children:
child_node = self.tree[child_id]
if child_node.item == item:
self.tree[child_id].count += 1
next_node_id = child_id
break
else: # 循环正常结束,表明当前父节点的子节点中没有项与之匹配,所以新建子节点,更新项头表和树
self.max_node_id += 1
next_node_id = copy.copy(self.max_node_id) # 注意self.max_node_id 是可变的,引用时需要copy
self.tree[next_node_id] = node(item=item, count=1, parent=parent) # 更新树,添加节点
self.tree[parent].children.append(next_node_id) # 更新父节点的孩子列表
self.item_head[item].append(next_node_id) # 更新项头表
return next_node_id
def build_tree(self, data):
# 构建项头表以及FP树, 第二次扫描数据集
one_itemset = set(self.fre_one_itemset.keys())
self.tree[0] = node(item=None, count=0, parent=-1)
for t in data:
t = list(set(t) & one_itemset) # 去除该事务中非频繁项
if len(t) > 0:
t = sorted(t, key=lambda x: self.sort_rules[x]) # 按照项的频繁程度从大到小排序
parent = 0 # 每个事务都是从树根开始插起
for item in t:
parent = self.insert_item(parent, item) # 将排序后的事务中每个项依次插入FP树
return
def get_path(self, pre_tree, condition_tree, node_id, suffix_items_count):
# 根据后缀的某个叶节点的父节点出发,选取出路径,并更新计数。suffix_item_count为后缀的计数
if node_id == 0:
return
else:
if node_id not in condition_tree.keys():
current_node = copy.deepcopy(pre_tree[node_id])
current_node.count = suffix_items_count # 更新计数
condition_tree[node_id] = current_node
else: # 若叶节点有多个,则路径可能有重复,计数叠加
condition_tree[node_id].count += suffix_items_count
node_id = condition_tree[node_id].parent
self.get_path(pre_tree, condition_tree, node_id, suffix_items_count) # 递归构建路径
return
def get_condition_tree(self, pre_tree, suffix_items_ids):
# 构建后缀为一个项的条件模式基。可能对应多个叶节点,综合后缀的各个叶节点的路径
condition_tree = defaultdict() # 字典存储条件FP树,值为父节点
for suffix_id in suffix_items_ids: # 从各个后缀叶节点出发,综合各条路径形成条件FP树
suffix_items_count = copy.copy(pre_tree[suffix_id].count) # 叶节点计数
node_id = pre_tree[suffix_id].parent # 注意条件FP树不包括后缀
if node_id == 0:
continue
self.get_path(pre_tree, condition_tree, node_id, suffix_items_count)
return condition_tree
def extract_suffix_set(self, condition_tree, suffix_items):
# 根据条件模式基,提取频繁项集, suffix_item为该条件模式基对应的后缀
# 返回新的后缀,以及新添加项(将作为下轮的叶节点)的id
new_suffix_items_list = [] # 后缀中添加的新项
new_item_head = defaultdict(list) # 基于当前的条件FP树,更新项头表, 新添加的后缀项
item_sup_dict = defaultdict(int)
for key, val in condition_tree.items():
item_sup_dict[val.item] += val.count # 对项出现次数进行统计
new_item_head[val.item].append(key)
for item, sup in item_sup_dict.items():
if sup >= self.minsup_num: # 若条件FP树中某个项是频繁的,则添加到后缀中
current_item_set = [item] + suffix_items
self.fre_itemsets.append(current_item_set)
self.fre_itemsets_sups.append(sup)
new_suffix_items_list.append(current_item_set)
else:
new_item_head.pop(item)
return new_suffix_items_list, new_item_head.values()
def get_fre_set(self, data):
# 构建以每个频繁1项为后缀的频繁项集
self.init_param(data)
suffix_items_list = []
suffix_items_id_list = []
for key, val in self.fre_one_itemset.items():
suffix_items = [key]
suffix_items_list.append(suffix_items)
suffix_items_id_list.append(self.item_head[key])
self.fre_itemsets.append(suffix_items)
self.fre_itemsets_sups.append(val)
pre_tree = copy.deepcopy(self.tree) # pre_tree 是尚未去除任何后缀的前驱,若其叶节点的项有多种,则可以形成多种条件FP树
self.dfs_search(pre_tree, suffix_items_list, suffix_items_id_list)
return
def bfs_search(self, pre_tree, suffix_items_list, suffix_items_id_list):
# 宽度优先,递增构建频繁k项集
q = deque()
q.appendleft((pre_tree, suffix_items_list, suffix_items_id_list))
while len(q) > 0:
param_tuple = q.pop()
pre_tree = param_tuple[0]
for suffix_items, suffix_items_ids in zip(param_tuple[1], param_tuple[2]):
condition_tree = self.get_condition_tree(pre_tree, suffix_items_ids)
new_suffix_items_list, new_suffix_items_id_list = self.extract_suffix_set(condition_tree, suffix_items)
if new_suffix_items_list:
q.appendleft(
(condition_tree, new_suffix_items_list, new_suffix_items_id_list)) # 储存前驱,以及产生该前驱的后缀的信息
return
def dfs_search(self, pre_tree, suffix_items_list, suffix_items_id_list):
# 深度优先,递归构建以某个项为后缀的频繁k项集
for suffix_items, suffix_items_ids in zip(suffix_items_list, suffix_items_id_list):
condition_tree = self.get_condition_tree(pre_tree, suffix_items_ids)
new_suffix_items_list, new_suffix_items_id_list = self.extract_suffix_set(condition_tree, suffix_items)
if new_suffix_items_list: # 如果后缀有新的项添加进来,则继续深度搜索
self.dfs_search(condition_tree, new_suffix_items_list, new_suffix_items_id_list)
return
if __name__ == '__main__':
data1 = [list('ABCEFO'), list('ACG'), list('ET'), list('ACDEG'), list('ACEGL'),
list('EJ'), list('ABCEFP'), list('ACD'), list('ACEGM'), list('ACEGN')]
data2 = [list('ab'), list('bcd'), list('acde'), list('ade'), list('abc'),
list('abcd'), list('a'), list('abc'), list('abd'), list('bce')]
data3 = [['r', 'z', 'h', 'j', 'p'], ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], ['z'], ['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'], ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
fp = FP(minsup=0.5)
fp.get_fre_set(data3)
for itemset, sup in zip(fp.fre_itemsets, fp.fre_itemsets_sups):
print(itemset, sup)
['z'] 5
['s'] 3
['t'] 3
['r'] 3
['x'] 4
['y'] 3
['x', 's'] 3
['z', 't'] 3
['x', 't'] 3
['z', 'x', 't'] 3
['z', 'x'] 3
['z', 'y'] 3
['x', 'y'] 3
['t', 'y'] 3
['z', 'x', 'y'] 3
['z', 't', 'y'] 3
['x', 't', 'y'] 3
['z', 'x', 't', 'y'] 3
https://github.com/Shi-Lixin/Machine-Learning-Algorithms
#coding:utf-8
from numpy import *
class treeNode:
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
self.parent = parentNode #needs to be updated
self.children = {}
def inc(self,numOccur):
self.count += numOccur
def disp(self,ind = 1):
print (' '*ind,self.name,' ',self.count)
for child in self.children.values():
child.disp(ind+1)
#test
rootNode = treeNode('pyramid',9,None)
rootNode.children['eye'] = treeNode('eye',13,None)
a = rootNode.disp()
print(a)
pyramid 9
eye 13
None
class treeNode:
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue
self.count = numOccur
self.nodeLink = None
# needs to be updated
self.parent = parentNode
self.children = {}
def inc(self, numOccur):
"""inc(对count变量增加给定值) """
self.count += numOccur
def disp(self, ind=1):
"""disp(用于将树以文本形式显示) """
print(' '*ind, self.name, ' ', self.count)
for child in self.children.values():
child.disp(ind+1)
def __lt__(self, other):
return self.count < other.count
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat
def createInitSet(dataSet):
retDict = {}
for trans in dataSet:
if frozenset(trans) not in retDict:
retDict[frozenset(trans)] = 1
else:
retDict[frozenset(trans)] += 1
return retDict
# this version does not use recursion
def updateHeader(nodeToTest, targetNode):
"""updateHeader(更新头指针,建立相同元素之间的关系,例如: 左边的r指向右边的r值,就是后出现的相同元素 指向 已经出现的元素) 从头指针的nodeLink开始,一直沿着nodeLink直到到达链表末尾。这就是链表。 性能:如果链表很长可能会遇到迭代调用的次数限制。 Args: nodeToTest 满足minSup {所有的元素+(value, treeNode)} targetNode Tree对象的子节点 """
# 建立相同元素之间的关系,例如: 左边的r指向右边的r值
while (nodeToTest.nodeLink is not None):
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode
def updateTree(items, inTree, headerTable, count):
"""updateTree(更新FP-tree,第二次遍历) # 针对每一行的数据 # 最大的key, 添加 Args: items 满足minSup 排序后的元素key的数组(大到小的排序) inTree 空的Tree对象 headerTable 满足minSup {所有的元素+(value, treeNode)} count 原数据集中每一组Kay出现的次数 """
# 取出 元素 出现次数最高的
# 如果该元素在 inTree.children 这个字典中,就进行累加
# 如果该元素不存在 就 inTree.children 字典中新增key,value为初始化的 treeNode 对象
if items[0] in inTree.children:
# 更新 最大元素,对应的 treeNode 对象的count进行叠加
inTree.children[items[0]].inc(count)
else:
# 如果不存在子节点,我们为该inTree添加子节点
inTree.children[items[0]] = treeNode(items[0], count, inTree)
# 如果满足minSup的dist字典的value值第二位为null, 我们就设置该元素为 本节点对应的tree节点
# 如果元素第二位不为null,我们就更新header节点
if headerTable[items[0]][1] is None:
# headerTable只记录第一次节点出现的位置
headerTable[items[0]][1] = inTree.children[items[0]]
else:
# 本质上是修改headerTable的key对应的Tree,的nodeLink值
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
if len(items) > 1:
# 递归的调用,在items[0]的基础上,添加item0[1]做子节点, count只要循环的进行累计加和而已,统计出节点的最后的统计值。
updateTree(items[1:], inTree.children[items[0]], headerTable, count)
def createTree(dataSet, minSup=1):
"""createTree(生成FP-tree) Args: dataSet dist{行:出现次数}的样本数据 minSup 最小的支持度 Returns: retTree FP-tree headerTable 满足minSup {所有的元素+(value, treeNode)} """
# 支持度>=minSup的dist{所有元素:出现的次数}
headerTable = {}
# 循环 dist{行:出现次数}的样本数据
for trans in dataSet:
# 对所有的行进行循环,得到行里面的所有元素
# 统计每一行中,每个元素出现的总次数
for item in trans:
# 例如: {'ababa': 3} count(a)=3+3+3=9 count(b)=3+3=6
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
# 删除 headerTable中,元素次数<最小支持度的元素
temp_keys = list(headerTable.keys())
for k in temp_keys:
if headerTable[k] < minSup:
del(headerTable[k])
# 满足minSup: set(各元素集合)
freqItemSet = set(headerTable.keys())
# 如果不存在,直接返回None
if len(freqItemSet) == 0:
return None, None
for k in headerTable:
# 格式化: dist{元素key: [元素次数, None]}
headerTable[k] = [headerTable[k], None]
# create tree
retTree = treeNode('Null Set', 1, None)
# 循环 dist{行:出现次数}的样本数据
for tranSet, count in dataSet.items():
# print 'tranSet, count=', tranSet, count
# localD = dist{元素key: 元素总出现次数}
localD = {}
for item in tranSet:
# 判断是否在满足minSup的集合中
if item in freqItemSet:
# print 'headerTable[item][0]=', headerTable[item][0], headerTable[item]
localD[item] = headerTable[item][0]
# print 'localD=', localD
if len(localD) > 0:
# p=key,value; 所以是通过value值的大小,进行从大到小进行排序
# orderedItems 表示取出元组的key值,也就是字母本身,但是字母本身是大到小的顺序
orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]
# print 'orderedItems=', orderedItems, 'headerTable', headerTable, '\n\n\n'
# 填充树,通过有序的orderedItems的第一位,进行顺序填充 第一层的子节点。
updateTree(orderedItems, retTree, headerTable, count)
return retTree, headerTable
def ascendTree(leafNode, prefixPath):
"""ascendTree(如果存在父节点,就记录当前节点的name值) Args: leafNode 查询的节点对于的nodeTree prefixPath 要查询的节点值 """
if leafNode.parent is not None:
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent, prefixPath)
def findPrefixPath(basePat, treeNode):
"""findPrefixPath 基础数据集 Args: basePat 要查询的节点值 treeNode 查询的节点所在的当前nodeTree Returns: condPats 对非basePat的倒叙值作为key,赋值为count数 """
#print("basePat %s" %(basePat))
#print("treeNode %s" %(treeNode))
condPats = {}
# 对 treeNode的link进行循环
while treeNode is not None:
prefixPath = []
# 寻找改节点的父节点,相当于找到了该节点的频繁项集
ascendTree(treeNode, prefixPath)
#print(prefixPath)
# 避免 单独`Z`一个元素,添加了空节点
if len(prefixPath) > 1:
print(prefixPath)
# 对非basePat的倒叙值作为key,赋值为count数
# prefixPath[1:] 变frozenset后,字母就变无序了
# condPats[frozenset(prefixPath)] = treeNode.count
condPats[frozenset(prefixPath[1:])] = treeNode.count
# 递归,寻找改节点的下一个 相同值的链接节点
treeNode = treeNode.nodeLink
#print (treeNode)
return condPats
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
"""mineTree(创建条件FP树) Args: inTree myFPtree headerTable 满足minSup {所有的元素+(value, treeNode)} minSup 最小支持项集 preFix preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新 freqItemList 用来存储频繁子项的列表 """
# 通过value进行从小到大的排序, 得到频繁项集的key
# 最小支持项集的key的list集合
bigL = [v[0] for v in sorted(headerTable.items(), key=lambda p: p[1])]
print('-----', sorted(headerTable.items(), key=lambda p: p[1]))
print('bigL=', bigL)
# 循环遍历 最频繁项集的key,从小到大的递归寻找对应的频繁项集
for basePat in bigL:
# preFix为newFreqSet上一次的存储记录,一旦没有myHead,就不会更新
newFreqSet = preFix.copy()
newFreqSet.add(basePat)
print('newFreqSet=', newFreqSet, preFix)
freqItemList.append(newFreqSet)
print('freqItemList=', freqItemList)
condPattBases = findPrefixPath(basePat, headerTable[basePat][1])
print('condPattBases=', basePat, condPattBases)
# 构建FP-tree
myCondTree, myHead = createTree(condPattBases, minSup)
print('myHead=', myHead)
# 挖掘条件 FP-tree, 如果myHead不为空,表示满足minSup {所有的元素+(value, treeNode)}
if myHead is not None:
myCondTree.disp(1)
print('\n\n\n')
# 递归 myHead 找出频繁项集
mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
print('\n\n\n')
#test
data1 = [list('ABCEFO'), list('AG'), list('ET'), list('ACDEG'), list('ACEGL'),
list('EJ'), list('ABCEFP'), list('ACD'), list('ACEGM'), list('ACEGN')]
data2 = [list('ab'), list('bcd'), list('acde'), list('ade'), list('abc'),
list('abcd'), list('a'), list('abc'), list('abd'), list('bce')]
data3 = [['r', 'z', 'h', 'j', 'p'], ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'], ['z'], ['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'], ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
#print(data1)
#simpDat = loadSimpDat()
initSet = createInitSet(data3)
myFPtree,myHeaderTab = createTree(initSet,3)
a = myFPtree.disp()
#b = findPrefixPath('C',myHeaderTab['C'][1])
#print (b)
print(findPrefixPath('x',myHeaderTab['x'][1]))
#freqItems = []
#mineTree(myFPtree,myHeaderTab,2,set([]),freqItems)
#for x in freqItems:
# print(x) # 打印频繁项集
Null Set 1
x 1
r 1
s 1
z 5
r 1
x 3
t 3
r 1
y 1
s 2
y 2
['x', 'z']
['x', 'z']
['x']
{frozenset({'z'}): 3}
Mining a news-site click stream: https://www.cnblogs.com/ybjourney/p/4851540.html
The algorithm was verified above on hand-made data; now it is tested on real data. The data set contains about one million records, where each line of the file lists the news stories one user viewed; the goal is to find stories viewed by at least 100,000 people. The code is as follows:
parsedData = [line.split() for line in open('kosarak.dat').readlines()]
initSet = createInitSet(parsedData)
myFPtree, myHeaderTab = createTree(initSet, 100000)
myFreqList = []
mineTree(myFPtree, myHeaderTab, 100000, set([]), myFreqList)
print(len(myFreqList))
print(myFreqList)
MOOC article: the code there is not runnable, but the explanation is very clear: https://www.imooc.com/article/33990