数据探索
计算相关系数
为了更加准确地描述变量之间的线性相关程度,可以通过计算相关系统来进行相关分析。
在二元变量的相关分析过程中比较常用的有Pearson相关系数,Spearman秩相关系数和判定系数。
皮尔逊相关系数(Pearson Correlation Coefficient)
一般用于分析两个连续性变量之间的关系,其计算公式如下。
r = ∑ i = 1 n ( x i − x ‾ ) ( y i − y ‾ ) ∑ i = 1 n ( x i − x ‾ ) 2 ∑ i = 1 n ( y i − y ‾ ) 2 r = { \sum_{i=1}^{n}(xi-\overline{x})(yi-\overline{y})\over\sqrt{\sum_{i=1}^{n}(xi-\overline{x})^2\sum_{i=1}^{n}(yi-\overline{y})^2}} r=∑i=1n(xi−x)2∑i=1n(yi−y)2 ∑i=1n(xi−x)(yi−y)
相关系数r的取值范围:-1 <= r <= 1
{ r > 0 为 正 相 关 , r < 0 为 负 相 关 ∣ r ∣ = 0 表 示 不 存 在 线 性 关 系 ∣ r ∣ = 1 表 示 完 全 线 性 相 关 \begin{cases} r > 0 为正相关,r<0为负相关\\ |r| = 0 表示不存在线性关系\\ |r| = 1 表示完全线性相关 \end{cases} ⎩⎪⎨⎪⎧r>0为正相关,r<0为负相关∣r∣=0表示不存在线性关系∣r∣=1表示完全线性相关
0<|r|<1表示存在不同程度线性相关
{ ∣ r ∣ < = 0.3 为 不 存 在 线 性 相 关 0.3 < ∣ r ∣ < = 0.5 为 低 度 线 性 相 关 0.5 < ∣ r ∣ < = 0.8 为 显 著 线 性 相 关 ∣ r ∣ > 0.8 为 高 度 线 性 相 关 \begin{cases} |r|<=0.3为不存在线性相关\\ 0.3<|r|<=0.5为低度线性相关\\ 0.5<|r|<=0.8为显著线性相关\\ |r|>0.8为高度线性相关 \end{cases} ⎩⎪⎪⎪⎨⎪⎪⎪⎧∣r∣<=0.3为不存在线性相关0.3<∣r∣<=0.5为低度线性相关0.5<∣r∣<=0.8为显著线性相关∣r∣>0.8为高度线性相关
近似计算公式
上面的公式有一个问题在于算法时可能需要对数据进行多遍扫描。幸运的是,对于算法实现人员而言,还有一个皮尔逊相关系数的近似计算公式:
r = ∑ i = 1 n x i y i − ∑ i = 1 n x i ∑ i = 1 n y i n ∑ i = 1 n x i 2 − ( ∑ i = 1 n x i ) 2 n ∑ i = 1 n y i 2 − ( ∑ i = 1 n y i ) 2 n r=\frac{\sum_{i=1}^nx_iy_i-\frac{\sum_{i=1}^nx_i\sum_{i=1}^ny_i}{n}}{\sqrt{\sum_{i=1}^nx_i^2-\frac{(\sum_{i=1}^nx_i)^2}{n}}\sqrt{\sum_{i=1}^ny_i^2-\frac{(\sum_{i=1}^ny_i)^2}{n}}} r=∑i=1nxi2−n(∑i=1nxi)2 ∑i=1nyi2−n(∑i=1nyi)2 ∑i=1nxiyi−n∑i=1nxi∑i=1nyi
Python
def pearson(rating1, rating2):
sum_xy = 0
sum_x = 0
sum_y = 0
sum_x2 = 0
sum_y2 = 0
n = 0
for key in rating1:
if key in rating2:
n += 1
x = rating1[key]
y = rating2[key]
sum_xy += x * y
sum_x += x
sum_y += y
sum_x2 += pow(x, 2)
sum_y2 += pow(y, 2)
# now compute denominator
denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n)
if denominator == 0:
return 0
else:
return (sum_xy - (sum_x * sum_y) / n) / denominator
Spearman秩相关系数
Pearson线性相关系数要求连续变量的取值服从正太分布。不服从正态分布的变量、分类或等级变量之间的关联性可采用Spearman秩相关系数,也称等级相关系数来描述。
其计算公式如下:
r = 1 − 6 ∑ i = 1 n ( R i − Q i ) 2 n ( n 2 − 1 ) r={1-{ {6\sum_{i=1}^{n}(Ri-Qi)^2}\over{n(n^2-1)}}} r=1−n(n2−1)6∑i=1n(Ri−Qi)2
研究表明,在正态分布假设下,Spearman秩相关系数与Pearson相关系数在效率上是等价的,而对于连续测量数据,更适合用Pearson相关系数来进行分析。
判定系数
判定系数是相关系数的平方,用 r 2 r^2 r2表示;用来衡量回归方程对y的解释程度。
判定系数取值范围:0<= r 2 r^2 r2<=1, r 2 r^2 r2越接近于1,表示x与y之间的相关性越强;
r 2 r^2 r2越接近于0,表明两个变量之间几乎没有直线相关关系。
相似度的选择
如果数据受分数贬值(grade-inflation,即不同用户使用不同的评级范围)的影响,则使用皮尔逊相关系数。
如果数据稠密(几乎所有属性都没有零值)且属性值大小十分重要,那么使用诸如欧式距离或者曼哈顿距离。
如果数据稀疏,考虑使用余弦相似度。
相关代码
import codecs
from math import sqrt
users = { "Angelica": { "Blues Traveler": 3.5, "Broken Bells": 2.0,
"Norah Jones": 4.5, "Phoenix": 5.0,
"Slightly Stoopid": 1.5,
"The Strokes": 2.5, "Vampire Weekend": 2.0},
"Bill":{ "Blues Traveler": 2.0, "Broken Bells": 3.5,
"Deadmau5": 4.0, "Phoenix": 2.0,
"Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
"Chan": { "Blues Traveler": 5.0, "Broken Bells": 1.0,
"Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5,
"Slightly Stoopid": 1.0},
"Dan": { "Blues Traveler": 3.0, "Broken Bells": 4.0,
"Deadmau5": 4.5, "Phoenix": 3.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 2.0},
"Hailey": { "Broken Bells": 4.0, "Deadmau5": 1.0,
"Norah Jones": 4.0, "The Strokes": 4.0,
"Vampire Weekend": 1.0},
"Jordyn": { "Broken Bells": 4.5, "Deadmau5": 4.0,
"Norah Jones": 5.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.5, "The Strokes": 4.0,
"Vampire Weekend": 4.0},
"Sam": { "Blues Traveler": 5.0, "Broken Bells": 2.0,
"Norah Jones": 3.0, "Phoenix": 5.0,
"Slightly Stoopid": 4.0, "The Strokes": 5.0},
"Veronica": { "Blues Traveler": 3.0, "Norah Jones": 5.0,
"Phoenix": 4.0, "Slightly Stoopid": 2.5,
"The Strokes": 3.0}
}
class recommender:
def __init__(self, data, k=1, metric='pearson', n=5):
""" initialize recommender currently, if data is dictionary the recommender is initialized to it. For all other data types of data, no initialization occurs k is the k value for k nearest neighbor metric is which distance formula to use n is the maximum number of recommendations to make"""
self.k = k
self.n = n
self.username2id = { }
self.userid2name = { }
self.productid2name = { }
# for some reason I want to save the name of the metric
self.metric = metric
if self.metric == 'pearson':
self.fn = self.pearson
#
# if data is dictionary set recommender data to it
#
if type(data).__name__ == 'dict':
self.data = data
def convertProductID2name(self, id):
"""Given product id number return product name"""
if id in self.productid2name:
return self.productid2name[id]
else:
return id
def userRatings(self, id, n):
"""Return n top ratings for user with id"""
print ("Ratings for " + self.userid2name[id])
ratings = self.data[id]
print(len(ratings))
ratings = list(ratings.items())
ratings = [(self.convertProductID2name(k), v)
for (k, v) in ratings]
# finally sort and return
ratings.sort(key=lambda artistTuple: artistTuple[1],
reverse = True)
ratings = ratings[:n]
for rating in ratings:
print("%s\t%i" % (rating[0], rating[1]))
def loadBookDB(self, path=''):
"""loads the BX book dataset. Path is where the BX files are located"""
self.data = { }
i = 0
#
# First load book ratings into self.data
#
f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')
for line in f:
i += 1
#separate line into fields
fields = line.split(';')
user = fields[0].strip('"')
book = fields[1].strip('"')
rating = int(fields[2].strip().strip('"'))
if user in self.data:
currentRatings = self.data[user]
else:
currentRatings = { }
currentRatings[book] = rating
self.data[user] = currentRatings
f.close()
#
# Now load books into self.productid2name
# Books contains isbn, title, and author among other fields
#
f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')
for line in f:
i += 1
#separate line into fields
fields = line.split(';')
isbn = fields[0].strip('"')
title = fields[1].strip('"')
author = fields[2].strip().strip('"')
title = title + ' by ' + author
self.productid2name[isbn] = title
f.close()
#
# Now load user info into both self.userid2name and
# self.username2id
#
f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')
for line in f:
i += 1
#print(line)
#separate line into fields
fields = line.split(';')
userid = fields[0].strip('"')
location = fields[1].strip('"')
if len(fields) > 3:
age = fields[2].strip().strip('"')
else:
age = 'NULL'
if age != 'NULL':
value = location + ' (age: ' + age + ')'
else:
value = location
self.userid2name[userid] = value
self.username2id[location] = userid
f.close()
print(i)
def pearson(self, rating1, rating2):
sum_xy = 0
sum_x = 0
sum_y = 0
sum_x2 = 0
sum_y2 = 0
n = 0
for key in rating1:
if key in rating2:
n += 1
x = rating1[key]
y = rating2[key]
sum_xy += x * y
sum_x += x
sum_y += y
sum_x2 += pow(x, 2)
sum_y2 += pow(y, 2)
if n == 0:
return 0
# now compute denominator
denominator = (sqrt(sum_x2 - pow(sum_x, 2) / n)
* sqrt(sum_y2 - pow(sum_y, 2) / n))
if denominator == 0:
return 0
else:
return (sum_xy - (sum_x * sum_y) / n) / denominator
def computeNearestNeighbor(self, username):
"""creates a sorted list of users based on their distance to username"""
distances = []
for instance in self.data:
if instance != username:
distance = self.fn(self.data[username],
self.data[instance])
distances.append((instance, distance))
# sort based on distance -- closest first
distances.sort(key=lambda artistTuple: artistTuple[1],
reverse=True)
return distances
def recommend(self, user):
"""Give list of recommendations"""
recommendations = { }
# first get list of users ordered by nearness
nearest = self.computeNearestNeighbor(user)
#
# now get the ratings for the user
#
userRatings = self.data[user]
#
# determine the total distance
totalDistance = 0.0
for i in range(self.k):
totalDistance += nearest[i][1]
# now iterate through the k nearest neighbors
# accumulating their ratings
for i in range(self.k):
# compute slice of pie
weight = nearest[i][1] / totalDistance
# get the name of the person
name = nearest[i][0]
# get the ratings for this person
neighborRatings = self.data[name]
# get the name of the person
# now find bands neighbor rated that user didn't
for artist in neighborRatings:
if not artist in userRatings:
if artist not in recommendations:
recommendations[artist] = (neighborRatings[artist]
* weight)
else:
recommendations[artist] = (recommendations[artist]
+ neighborRatings[artist]
* weight)
# now make list from dictionary
recommendations = list(recommendations.items())
recommendations = [(self.convertProductID2name(k), v)
for (k, v) in recommendations]
# finally sort and return
recommendations.sort(key=lambda artistTuple: artistTuple[1],
reverse = True)
# Return the first n items
return recommendations[:self.n]