基于用户的协同过滤算法在用户数量增长时,用户相似度的计算会越来越困难。基于物品的协同过滤算法(ItemCF)则给用户推荐与他们之前喜欢的物品相似的物品。
算法步骤
计算物品之间的相似度
根据物品的相似度和用户的历史行为给用户生成推荐列表
相似度公式如下:$$w_{ij}=\frac{|N(i)\cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$$ 其中 $N(i)$ 是喜欢物品 $i$ 的用户集合。
该公式的分母惩罚了热门物品的权重,从而减轻了热门物品和很多物品都相似的可能性。在计算相似度的时候,先建立用户到物品的倒排表。相似度的计算过程与基于用户的算法相同,不再赘述。
- 其中 $N(u)$ 是用户 $u$ 喜欢的物品的集合,$S(j,K)$ 是和物品 $j$ 最相似的 $K$ 个物品的集合,$w_{ji}$ 是物品 $j$ 和物品 $i$ 的相似度(这些符号用于下文的兴趣公式)。
- 相似度的归一化可以提高推荐的多样性和覆盖率。
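计算相似度之后,用户 $u$ 对物品 $j$ 的兴趣如下:$$p_{uj}=\sum_{i\in N(u)\cap S(j,K)} w_{ji}\,r_{ui}$$ 其中 $r_{ui}$ 是用户 $u$ 对物品 $i$ 的兴趣(下面的代码中取用户对该物品的评分)。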
用户活跃度对物品相似度的影响
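考虑到并不是每个用户对推荐的贡献度都是一样的,例如某书店店主买了当当上 80% 的书作为存货,这 80% 的书两两之间都会产生关联,但这种关联并不是由用户的兴趣爱好产生的。我们要降低这类用户的贡献度,引入 IUF(Inverse User Frequency),即用户活跃度对数的倒数。利用 IUF 修正物品相似度的计算:$$w_{ij}=\frac{\sum_{u\in N(i)\cap N(j)}\frac{1}{\log\left(1+|N(u)|\right)}}{\sqrt{|N(i)|\,|N(j)|}}$$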
物品相似度归一化
Karypis 在研究中发现,如果将相似度矩阵按照最大值归一化,会提高推荐的准确率。即 $$w'_{ij}=\frac{w_{ij}}{\max_{j} w_{ij}}$$
完整代码如下
import random
import sys
import math
import os
from operator import itemgetter
# Seed the module-level RNG with a fixed value so that code drawing from
# `random` afterwards produces the same sequence on every run.
random.seed(0)
class ItemBasedCF(object):
    """Item-based collaborative filtering (ItemCF) top-N recommender.

    Typical usage is ``generate_dataset`` -> ``calc_movie_sim`` ->
    ``recommend`` / ``evaluate``.

    Attributes:
        trainset: ``{user: {movie: rating}}`` used to build the model.
        testset: ``{user: {movie: rating}}`` held out for evaluation.
        n_sim_movie: K, number of most-similar movies consulted per
            watched movie.
        n_rec_movie: N, number of movies returned per recommendation.
        movie_sim_mat: ``{movie: {other_movie: similarity}}``.
        movie_popular: ``{movie: number of training users who rated it}``.
        movie_count: number of distinct movies in the training set.
    """

    def __init__(self, n_sim_movie=20, n_rec_movie=10):
        """Create an empty model.

        Args:
            n_sim_movie: K, how many similar movies to consult per watched
                movie (default 20).
            n_rec_movie: N, how many movies to recommend (default 10).
        """
        self.trainset = {}
        self.testset = {}
        self.n_sim_movie = n_sim_movie
        self.n_rec_movie = n_rec_movie
        self.movie_sim_mat = {}
        self.movie_popular = {}
        self.movie_count = 0
        print('Similar movie number = %d' % self.n_sim_movie, file=sys.stderr)
        print('Recommendend movie number = %d' % self.n_rec_movie, file=sys.stderr)

    @staticmethod
    def loadfile(filename):
        """Yield the lines of *filename* with trailing CR/LF removed.

        Progress is written to stderr every 100000 lines. The file is
        opened with a context manager so it is closed even when the
        consumer stops iterating early or an exception propagates.
        """
        with open(filename, 'r') as fp:
            for i, line in enumerate(fp):
                yield line.strip('\r\n')
                if i % 100000 == 0:
                    print('load %s(%s)' % (filename, i), file=sys.stderr)
        print('load %s succ' % filename, file=sys.stderr)

    def generate_dataset(self, filename, pivot=0.7):
        """Read ``user::movie::rating::extra`` lines and split them randomly.

        Each record goes to ``trainset`` with probability *pivot*,
        otherwise to ``testset``. Ratings are stored as ``int``.

        Args:
            filename: path of the ratings file.
            pivot: probability that a record is placed in the training set.

        Raises:
            ValueError: if a non-blank line does not have exactly four
                ``::``-separated fields or the rating is not an integer.
        """
        trainset_len = 0
        testset_len = 0
        for line in self.loadfile(filename):
            # Blank lines (e.g. a trailing newline) carry no record; skip
            # them before drawing a random number so that the split of the
            # real records is unaffected.
            if not line.strip():
                continue
            user, movie, rating, _ = line.split('::')
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[movie] = int(rating)
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[movie] = int(rating)
                testset_len += 1
        print('split succ , trainset is %d , testset is %d' % (trainset_len, testset_len),
              file=sys.stderr)

    def calc_movie_sim(self):
        """Build ``movie_popular``, ``movie_count`` and ``movie_sim_mat``.

        The similarity of movies i and j is
        ``co_rated(i, j) / sqrt(popular(i) * popular(j))`` where
        ``co_rated`` counts training users who rated both movies.

        Both dictionaries are cleared in place first, so calling this
        method again rebuilds the model instead of stacking new counts on
        top of already-normalised similarities.
        """
        self.movie_popular.clear()
        self.movie_sim_mat.clear()

        for movies in self.trainset.values():
            for movie in movies:
                self.movie_popular[movie] = self.movie_popular.get(movie, 0) + 1
        print('count movies number and pipularity succ', file=sys.stderr)
        self.movie_count = len(self.movie_popular)
        print('total movie number = %d' % self.movie_count, file=sys.stderr)

        itemsim_mat = self.movie_sim_mat
        print('building co-rated users matrix', file=sys.stderr)
        for movies in self.trainset.values():
            for m1 in movies:
                for m2 in movies:
                    if m1 == m2:
                        continue
                    row = itemsim_mat.setdefault(m1, {})
                    row[m2] = row.get(m2, 0) + 1
        print('build co-rated users matrix succ', file=sys.stderr)

        print('calculating movie similarity matrix', file=sys.stderr)
        simfactor_count = 0
        PRINT_STEP = 2000000
        for m1, related_movies in itemsim_mat.items():
            popular_m1 = self.movie_popular[m1]
            # Only values are overwritten, never keys, so mutating the row
            # while iterating over it is safe.
            for m2, count in related_movies.items():
                related_movies[m2] = count / math.sqrt(popular_m1 * self.movie_popular[m2])
                simfactor_count += 1
                if simfactor_count % PRINT_STEP == 0:
                    print('calcu movie similarity factor(%d)' % simfactor_count,
                          file=sys.stderr)
        print('calcu similiarity succ', file=sys.stderr)

    def recommend(self, user):
        """Return up to N ``(movie, score)`` pairs for *user*, best first.

        For every movie the user rated in the training set, its K most
        similar movies are looked up; each unseen one accumulates
        ``similarity * rating``.

        A user absent from the training set, or a watched movie with no
        entry in the similarity matrix (nobody co-rated it), contributes
        nothing instead of raising ``KeyError``.
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainset.get(user, {})
        for movie, rating in watched_movies.items():
            neighbours = self.movie_sim_mat.get(movie, {})
            top_k = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:K]
            for related_movie, similarity_factor in top_k:
                if related_movie in watched_movies:
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + similarity_factor * rating
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[0:N]

    def evaluate(self):
        """Print and return precision, recall, coverage and popularity.

        Recommendations are produced for every training user and compared
        with that user's test-set movies.

        * precision  = hits / number of movies actually recommended
        * recall     = hits / number of test-set movies
        * coverage   = distinct recommended movies / movies in training set
        * popularity = mean ``log(1 + popularity)`` of recommended movies

        The denominator for precision and popularity is the number of
        recommendations really returned (a user may get fewer than N), and
        any metric whose denominator is zero is reported as 0.0.

        Returns:
            tuple: ``(precision, recall, coverage, popularity)`` as floats.
        """
        print('evaluation start', file=sys.stderr)
        hit = 0
        rec_count = 0
        test_count = 0
        all_rec_movies = set()
        popular_sum = 0
        for i, user in enumerate(self.trainset):
            if i % 500 == 0:
                print('recommend for %d users ' % i, file=sys.stderr)
            test_movies = self.testset.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
                popular_sum += math.log(1 + self.movie_popular[movie])
            rec_count += len(rec_movies)
            test_count += len(test_movies)

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = len(all_rec_movies) / (1.0 * self.movie_count) if self.movie_count else 0.0
        popularity = popular_sum / (1.0 * rec_count) if rec_count else 0.0
        print('precision is %.4f\t recall is %.4f \t coverage is %.4f \t popularity is %.4f'
              % (precision, recall, coverage, popularity), file=sys.stderr)
        return precision, recall, coverage, popularity
if __name__ == '__main__':
    # Script entry point: load and split the ratings file, build the
    # item-item similarity matrix, then report the evaluation metrics.
    ratingfile = os.path.join('ml-1m', 'ratings.dat')

    itemcf = ItemBasedCF()
    itemcf.generate_dataset(ratingfile)
    itemcf.calc_movie_sim()
    itemcf.evaluate()
点击查看更多内容
为 TA 点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦