上一次修改时间：2018-07-27 04:37:39

事件推荐示例代码

kaggle event 推荐比赛

Event Recommendation Engine Challenge：预测用户是否会对某个活动感兴趣，based on events they've responded to in the past, user demographic information, and what events they've seen and clicked on in our app.

官网：https://www.kaggle.com/c/event-recommendation-engine-challenge

简单解决方案： 1.数据清洗与预处理 2.构建特征(包括协同过滤推荐度等复杂特征) 3.建模 4.生成提交的测试结果

导入工具包

#说明：推荐系统的数据量一般都很大，用pandas读数据的话，单机内存可能放不下，
#此处将每一步的处理结果存成文件，需要用时再load进内存，以节省内存的使用量
#特别说明：该文件里跑的数据是在原数据的基础上截取的一小部分，只用于测试代码
from __future__ import division
import itertools

#保存数据
import pickle#以二进制读写数据，相对较快

#以下四个是用于特征编码的
import datetime
import hashlib
import locale
import pycountry

import numpy as np
import scipy.io as sio
import scipy.sparse as ss

import scipy.spatial.distance as ssd#用于计算相似度/距离

from collections import defaultdict
from sklearn.preprocessing import normalize

1.数据清洗类

#类别型特征编码
class DataCleaner:
    """Encode raw CSV string fields (locale, gender, country, ...) as numbers.

    The lookup tables are ``defaultdict(int)`` instances, so any value that is
    not in a table is encoded as 0.
    """

    def __init__(self):
        # Locale alias -> 1-based id, taken from the stdlib alias table.
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1

        # Lower-cased country name -> 1-based id (position in pycountry).
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            name = c.name.lower()
            self.countryIdMap[name] = i + 1
            # NOTE(review): this compares the full country name with 'usa';
            # if pycountry spells that name differently the US subdivisions
            # below are never registered -- confirm against pycountry data.
            if name == 'usa':
                ctryIdx['US'] = i
            if name == 'canada':
                ctryIdx['CA'] = i
        # Subdivision names get the same id as the country they belong to.
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1

        # Gender string -> id; anything else is 0.
        self.genderIdMap = defaultdict(int, {"male": 1, "female": 2})

    def getLocaleId(self, locstr):
        """Return the id of the lower-cased locale string, or 0 if unknown."""
        return self.localeIdMap[locstr.lower()]

    def getGenderId(self, genderStr):
        """Return 1 for "male", 2 for "female" and 0 for anything else."""
        return self.genderIdMap[genderStr]

    def getJoinedYearMonth(self, dateString):
        """Return year and month of an ISO-like timestamp concatenated as text.

        Raises ValueError when *dateString* does not match
        ``%Y-%m-%dT%H:%M:%S.%fZ``.
        """
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        # NOTE(review): the month is not zero-padded ("20121" vs "201210"),
        # so the encoded values do not sort chronologically.
        return "".join([str(dttm.year), str(dttm.month)])

    def getCountryId(self, location):
        """Return the country id for a "<place>  <country>" location string.

        The country is whatever follows the last double space. Returns 0 when
        *location* is not a string, is blank, has no double space, or names
        an unknown country.
        """
        if isinstance(location, str) and location.strip():
            sep = location.rfind("  ")
            if sep > -1:
                # Slice from the separator that was actually searched for;
                # slicing from the last single space cut into the country.
                return self.countryIdMap[location[sep + 2:].lower()]
        return 0

    def getBirthYearInt(self, birthYear):
        """Return the birth year as int, or 0 for "None"/unparseable input."""
        try:
            return 0 if birthYear == "None" else int(birthYear)
        except (TypeError, ValueError, OverflowError):
            return 0

    def getTimezoneInt(self, timezone):
        """Return the timezone offset as int, or 0 when it cannot be parsed."""
        try:
            return int(timezone)
        except (TypeError, ValueError, OverflowError):
            return 0

    def getFeatureHash(self, value):
        """Return a 16-bit hash of *value*, or -1 when it is blank.

        The hash is the first four hex digits of the SHA-224 digest of the
        UTF-8 encoded text, read as an integer.
        """
        if not value.strip():
            return -1
        return int(hashlib.sha224(value.encode("utf8")).hexdigest()[0:4], 16)

    def getFloatValue(self, value):
        """Return *value* parsed as float, or 0.0 when it is blank."""
        if not value.strip():
            return 0.0
        return float(value)

2.处理user和event关联数据

class ProgramEntities:
    """Index the users and events that occur in the train and test files.

    Only users/events present in ``train_test.csv`` or ``test_test.csv`` are
    considered. Constructing an instance reads both files and produces:

    * ``userIndex`` / ``eventIndex``: id string -> row / column position,
      also pickled to ``PE_userIndex.pkl`` / ``PE_eventIndex.pkl``.
    * ``userEventScores``: sparse users x events matrix holding
      ``interested - not_interested`` for every training row, also written
      to ``PE_userEventScores`` in Matrix Market format.
    * ``uniqueUserPairs`` / ``uniqueEventPairs``: pairs of users sharing an
      event and pairs of events sharing a user.
    """

    def __init__(self):
        # Sets de-duplicate the ids seen across both files.
        uniqueUsers = set()
        uniqueEvents = set()
        eventsForUser = defaultdict(set)
        usersForEvent = defaultdict(set)

        for filename in ['./recommendData/train_test.csv', './recommendData/test_test.csv']:
            with open(filename, 'r') as f:
                # Show the header so the column layout is visible in the log.
                fileTitle = f.readline().strip().split(',')
                print("文件的格式为：\n",fileTitle)
                for rows, line in enumerate(f, start=1):
                    cols = line.strip().split(",")
                    # Echo the first five data rows.
                    if rows < 6:
                        print(cols)
                    # Column 0 is the user id, column 1 the event id.
                    user, event = cols[0], cols[1]
                    uniqueUsers.add(user)
                    uniqueEvents.add(event)
                    eventsForUser[user].add(event)
                    usersForEvent[event].add(user)

        # One row per unique user, one column per unique event; a new
        # dok_matrix reads as 0.0 everywhere.
        self.userEventScores = ss.dok_matrix((len(uniqueUsers), len(uniqueEvents)))

        # The enumeration order of each set fixes the row/column positions.
        self.userIndex = {u: i for i, u in enumerate(uniqueUsers)}
        self.eventIndex = {e: i for i, e in enumerate(uniqueEvents)}

        # Second pass over the training file to fill in the scores.
        with open('./recommendData/train_test.csv', 'r') as ftrain:
            ftrain.readline()  # skip header
            for line in ftrain:
                cols = line.strip().split(',')
                i = self.userIndex[cols[0]]
                j = self.eventIndex[cols[1]]
                # interested minus not_interested
                self.userEventScores[i, j] = int(cols[4]) - int(cols[5])

        sio.mmwrite("PE_userEventScores", self.userEventScores)

        # To avoid needless similarity computations later, record only the
        # pairs that are actually related: two users who both appear on the
        # same event, and two events that share a user. A pair needs just
        # two members, so groups of exactly two are included as well.
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) >= 2:
                self.uniqueUserPairs.update(itertools.combinations(users, 2))

        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) >= 2:
                self.uniqueEventPairs.update(itertools.combinations(events, 2))

        with open('PE_userIndex.pkl', 'wb') as fout:
            pickle.dump(self.userIndex, fout)
        with open('PE_eventIndex.pkl', 'wb') as fout:
            pickle.dump(self.eventIndex, fout)
        
        
# Step 1: collect user/event statistics; `pe` is reused by the later steps.
print("第1步：统计user和event相关信息\n")
pe = ProgramEntities()
print("第1步完成。\n")

3.用户与用户相似度矩阵

根据用户信息计算用户与用户相似度(非user based CF中的相似度)

class Users:
    """Build the user feature matrix and a user/user matrix derived from it.

    The values are computed from profile attributes in ``users.csv`` (locale,
    birth year, gender, join date, country, timezone), not from user/event
    interaction scores.
    """

    def __init__(self, programEntities, sim=ssd.correlation):
        """Read ``users.csv`` and write ``US_userMatrix``/``US_userSimMatrix``.

        Args:
            programEntities: object providing ``userIndex`` (user id -> row)
                and ``uniqueUserPairs`` (iterable of user id pairs).
            sim: callable taking two 1-D feature vectors and returning a
                number stored for that user pair.
        """
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())

        with open('./recommendData/users.csv', 'r') as fin:
            colnames = fin.readline().strip().split(',')
            print('文件格式为：\n' , colnames)
            self.userMatrix = ss.dok_matrix((nusers, len(colnames) - 1))

            for rows, line in enumerate(fin):
                cols = line.strip().split(',')
                if rows < 6:
                    print(cols)

                # Only users known to programEntities get a feature row.
                if cols[0] in programEntities.userIndex:
                    i = programEntities.userIndex[cols[0]]
                    self.userMatrix[i, 0] = cleaner.getLocaleId(cols[1])
                    self.userMatrix[i, 1] = cleaner.getBirthYearInt(cols[2])
                    self.userMatrix[i, 2] = cleaner.getGenderId(cols[3])
                    self.userMatrix[i, 3] = cleaner.getJoinedYearMonth(cols[4])
                    self.userMatrix[i, 4] = cleaner.getCountryId(cols[5])
                    self.userMatrix[i, 5] = cleaner.getTimezoneInt(cols[6])

        # L1-normalise every feature column.
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)

        self.userSimMatrix = ss.dok_matrix((nusers, nusers))

        # Every user is paired with itself with value 1.0.
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0

        # NOTE(review): the default `sim` is scipy's correlation *distance*,
        # while the diagonal above is set as a similarity -- confirm intent.
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                # scipy.spatial.distance functions expect 1-D vectors, so
                # flatten the sparse rows instead of passing 1xN matrices.
                rowI = self.userMatrix.getrow(i).toarray().ravel()
                rowJ = self.userMatrix.getrow(j).toarray().ravel()
                usim = sim(rowI, rowJ)
                # Store symmetrically.
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
        

# Step 2: compute the user matrices from the entities built in step 1.
print("第2步：计算用户相似度信息，并用矩阵形式存储\n")
Users(pe)
print("第2步完成。\n")

4.用户社交关系挖掘

class UserFriends:
    """Derive social features from each user's friend list.

    Two ideas drive the features:
    1) a user with more friends may be more outgoing and attend more events;
    2) a user may follow friends to the events they attend.
    """

    def __init__(self, programEntities):
        """Read ``user_friends_test.csv`` and write the ``UF_*`` matrices.

        Args:
            programEntities: object providing ``userIndex`` (user id -> row)
                and ``userEventScores`` (users x events sparse matrix).
        """
        nusers = len(programEntities.userIndex.keys())
        # Number of friends per user (normalised below).
        self.numFriends = np.zeros((nusers))
        # Pairwise friend-influence scores.
        self.userFriends = ss.dok_matrix((nusers, nusers))

        with open('./recommendData/user_friends_test.csv', 'r') as fin:
            # Columns: user, friends
            colnames = fin.readline().strip().split(',')
            print('文件格式为：\n' , colnames)
            # Each line describes one user.
            for ln, line in enumerate(fin):
                cols = line.strip().split(',')
                # Echo the first five data rows, like the other loaders do.
                if ln < 5:
                    print(cols)
                if ln % 200 == 0:  # progress report
                    print("Loading line: " , ln)

                user = cols[0]
                if user not in programEntities.userIndex:
                    continue
                # Friend ids are separated by single spaces.
                friends = cols[1].split(" ")
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend not in programEntities.userIndex:
                        continue
                    j = programEntities.userIndex[friend]
                    # The score is meant to capture how strongly, and in
                    # which direction, this friend may influence the user:
                    # the friend's summed event scores divided by the
                    # number of events.
                    eventsForUser = programEntities.userEventScores.getrow(j).todense()
                    score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                    self.userFriends[i, j] += score
                    self.userFriends[j, i] += score

        # Turn friend counts into fractions of the total.
        sumNumFriends = self.numFriends.sum(axis=0)
        self.numFriends = self.numFriends / sumNumFriends
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends))
        # L1-normalise every column of the influence matrix.
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)
    
# Step 3: compute the friend-based features from the step-1 entities.
print("第3步：计算用户社交关系信息，并存储\n")
UserFriends(pe)
print("第3步完成。\n")

5.构造event和event相似度数据

class Events:
    """Build event feature matrices and two event/event pair matrices.

    Two kinds of values are computed for every related event pair:
    1) ``eventPropSim`` from the event property columns (time, place, ...);
    2) ``eventContSim`` from the 100 word-count columns.
    """

    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        """Read ``events_test.csv`` and write the ``EV_*`` matrices.

        Args:
            programEntities: object providing ``eventIndex`` (event id ->
                row) and ``uniqueEventPairs`` (iterable of event id pairs).
            psim: callable applied to two 1-D property vectors.
            csim: callable applied to two 1-D word-count vectors.
        """
        cleaner = DataCleaner()
        nevents = len(programEntities.eventIndex.keys())

        self.eventPropMatrix = ss.dok_matrix((nevents, 7))
        self.eventContMatrix = ss.dok_matrix((nevents, 100))

        with open("./recommendData/events_test.csv", 'r') as fin:
            # Columns: event_id, user_id, start_time, city, state, zip,
            # country, lat, lng, followed by 100 word-count columns.
            colnames = fin.readline().strip().split(',')
            print('文件格式为：\n' , colnames)
            # Iterate lazily instead of loading the whole file at once.
            for rows, line in enumerate(fin, start=1):
                cols = line.strip().split(",")
                if rows < 6:
                    print(cols)

                eventId = cols[0]
                if eventId not in programEntities.eventIndex:
                    continue
                i = programEntities.eventIndex[eventId]

                # Encoded event properties.
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth(cols[2])  # start_time
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash(cols[3])  # city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash(cols[4])  # state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash(cols[5])  # zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash(cols[6])  # country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue(cols[7])  # lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue(cols[8])  # lon

                # Word counts.
                for j in range(9, 109):
                    self.eventContMatrix[i, j - 9] = cols[j]

        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite("EV_eventPropMatrix", self.eventPropMatrix)

        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite("EV_eventContMatrix", self.eventContMatrix)

        # Pairwise values for related events, based on the two matrices.
        self.eventPropSim = ss.dok_matrix((nevents, nevents))
        self.eventContSim = ss.dok_matrix((nevents, nevents))

        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            # scipy.spatial.distance functions expect 1-D vectors, so the
            # sparse rows are flattened before being compared.
            if (i, j) not in self.eventPropSim:
                epsim = psim(self.eventPropMatrix.getrow(i).toarray().ravel(),
                             self.eventPropMatrix.getrow(j).toarray().ravel())
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            if (i, j) not in self.eventContSim:
                ecsim = csim(self.eventContMatrix.getrow(i).toarray().ravel(),
                             self.eventContMatrix.getrow(j).toarray().ravel())
                # Store the content value itself; the property value must
                # not leak into the content matrix.
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
        sio.mmwrite("EV_eventPropSim", self.eventPropSim)
        sio.mmwrite("EV_eventContSim", self.eventContSim)
            
            
# Step 4: compute the event matrices from the step-1 entities.
print("第4步：计算event相似度信息，并用矩阵形式存储\n")
Events(pe)
print("第4步完成。\n")

6.活跃度/event热度数据

class EventAttendees():
    """Compute a popularity value per event from its attendee lists."""

    def __init__(self, programEvents):
        """Read ``event_attendees.csv`` and write ``EA_eventPopularity``.

        Args:
            programEvents: object providing ``eventIndex`` (event id -> row).
        """
        nevents = len(programEvents.eventIndex.keys())
        self.eventPopularity = ss.dok_matrix((nevents, 1))

        with open("./recommendData/event_attendees.csv", 'r') as f:
            # Columns: event_id, yes, maybe, invited, no
            colnames = f.readline().strip().split(',')
            print('文件格式为：\n' , colnames)
            for rows, line in enumerate(f, start=1):
                cols = line.strip().split(",")
                if rows < 6:
                    print(cols)
                eventId = cols[0]
                if eventId in programEvents.eventIndex:
                    i = programEvents.eventIndex[eventId]
                    # Popularity is (#yes - #no). split() without a
                    # separator yields [] for an empty field, so an empty
                    # list counts as zero people rather than one.
                    self.eventPopularity[i, 0] = \
                        len(cols[1].split()) - len(cols[4].split())

        # L1-normalise the single popularity column.
        self.eventPopularity = normalize(self.eventPopularity, norm='l1', axis=0, copy=False)
        sio.mmwrite("EA_eventPopularity", self.eventPopularity)

# Step 5: compute event popularity from the step-1 entities.
print("第5步：计算event热度信息\n")
EventAttendees(pe)
print("第5步完成。\n")

7.串起所有的数据处理和准备流程

"""
def data_prepare():
    #计算生成所有的数据，用矩阵或者其他形式存储方便后续提取特征和建模
    
    print("第1步：统计user和event相关信息\n")
    pe = ProgramEntities()
    print("第1步完成。\n")
    
    print("第2步：计算用户相似度信息，并用矩阵形式存储\n")
    Users(pe)
    print("第2步完成。\n")
    
    print("第3步：计算用户社交关系信息，并存储\n")
    UserFriends(pe)
    print("第3步完成。\n")
    
    print("第4步：计算event相似度信息，并用矩阵形式存储\n")
    Events(pe)
    print("第4步完成。\n")
    
    print("第5步：计算event热度信息\n")
    EventAttendees(pe)
    print("第5步完成。\n")

#运行进行数据准备
data_prepare()
"""

8.构建特征

#这是构建特征部分
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio
import math

#可以将多个模型的输出结果做为最终模型的一个输入
class DataRewriter:
    """Combine the stored matrices into one feature row per (user, event).

    The outputs of the earlier steps (collaborative-filtering style scores,
    popularity and friend influence) become the input columns of the final
    classifier.
    """

    def __init__(self):
        """Load the index maps and matrices written by the earlier steps."""
        # NOTE: pickle can execute code while loading; these files are
        # expected to be the ones produced by this pipeline.
        with open("PE_userIndex.pkl", 'rb') as fh:
            self.userIndex = pickle.load(fh)
        with open("PE_eventIndex.pkl", 'rb') as fh:
            self.eventIndex = pickle.load(fh)
        self.userEventScores = sio.mmread("PE_userEventScores").todense()
        self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()
        self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
        self.eventContSim = sio.mmread("EV_eventContSim").todense()
        self.numFriends = sio.mmread("UF_numFriends")
        self.userFriends = sio.mmread("UF_userFriends").todense()
        self.eventPopularity = sio.mmread("EA_eventPopularity").todense()

    @staticmethod
    def _zeroNansInPlace(mat):
        """Overwrite NaN entries of *mat* with 0 in place and return *mat*."""
        arr = np.asarray(mat)  # ndarray view onto the same data
        arr[np.isnan(arr)] = 0
        return mat

    def userReco(self, userId, eventId):
        """Return a user-based collaborative-filtering score for the event.

        Idea in pseudo code:
        for item i
            for every other user v that has a preference for i
                compute similarity s between u and v
                incorporate v's preference for i weighted by s
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        # Column j: every user's score for this event.
        vs = self.userEventScores[:, j]
        # Row i: this user's values against all other users.
        sims = self.userSimMatrix[i, :]
        # Row times column gives a 1x1 matrix.
        prod = sims * vs
        try:
            # Remove the user's own contribution for this event.
            return prod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            return 0

    def eventReco(self, userId, eventId):
        """Return item-based scores ``(pscore, cscore)`` for the event.

        Idea in pseudo code:
        for item i
            for every item j that u has a preference for
                compute similarity s between i and j
                add u's preference for j weighted by s

        ``pscore`` uses the event-property matrix and ``cscore`` the
        word-count matrix. NaN entries in the slices used here are replaced
        by 0 in the stored matrices.
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self._zeroNansInPlace(self.userEventScores[i, :])
        psim = self._zeroNansInPlace(self.eventPropSim[:, j])
        csim = self._zeroNansInPlace(self.eventContSim[:, j])

        pprod = js * psim
        cprod = js * csim
        # The user's own (NaN-cleaned) score for this event.
        ownScore = js[0, j]

        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - ownScore
        except IndexError:
            pass
        try:
            cscore = cprod[0, 0] - ownScore
        except IndexError:
            pass
        return pscore, cscore

    def userPop(self, userId):
        """Return the stored friend-count value of the user, or 0 if unknown.

        The thinking is that users with many friends may be more inclined to
        attend social events.
        """
        if userId in self.userIndex:
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0

    def friendInfluence(self, userId):
        """Return the mean of the user's row in the friend-influence matrix.

        The row is summed across all users and divided by the number of
        users, so every friend contributes to the value.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        # Summing the 1xN row over axis 0 would return the row unchanged
        # and only its first column would be used; sum the whole row.
        return self.userFriends[i, :].sum() / nusers

    def eventPop(self, eventId):
        """Return the stored popularity value of the event."""
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

    def rewriteData(self, start=1, train=True, header=True):
        """Write the feature file for the training or the test set.

        Args:
            start: 1-based number of the first input line to process;
                earlier lines are skipped (use 2 to skip the CSV header).
            train: read ``train_test.csv`` and append the two label columns
                when true, otherwise read ``test_test.csv``.
            header: write a header line to the output when true.
        """
        fn = "./recommendData/train_test.csv" if train else "./recommendData/test_test.csv"
        foutName = "./recommendData/data_train_test.csv" if train else "./recommendData/data_test_test.csv"
        with open(fn, 'r') as fin, open(foutName, 'w') as fout:
            if header:
                ocolnames = ["invited", "user_reco", "evt_p_reco", "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
                if train:
                    ocolnames.append("interested")
                    ocolnames.append("not_interested")
                fout.write(",".join(ocolnames) + "\n")
            for ln, line in enumerate(fin, start=1):
                if ln < start:
                    continue
                cols = line.strip().split(",")
                userId = cols[0]
                eventId = cols[1]
                invited = cols[2]
                if ln % 500 == 0:
                    print("%s:%d (userId , eventId)=(%s , %s)" % (fn , ln , userId , eventId))
                user_reco = self.userReco(userId, eventId)
                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
                user_pop = self.userPop(userId)
                frnd_infl = self.friendInfluence(userId)
                evt_pop = self.eventPop(eventId)
                ocols = [invited, user_reco, evt_p_reco, evt_c_reco, user_pop, frnd_infl, evt_pop]
                if train:
                    ocols.append(cols[4])  # interested
                    ocols.append(cols[5])  # not_interested
                fout.write(",".join(map(str, ocols)) + "\n")

    def rewriteTrainingSet(self):
        """Write the training feature file, skipping the input header line."""
        # Pass by keyword: the first positional parameter is `start`.
        self.rewriteData(start=2, train=True)

    def rewriteTestSet(self):
        """Write the test feature file, skipping the input header line."""
        self.rewriteData(start=2, train=False)

#When running with cython ,the actual class will be converted to a .so
#file , and the following code (along with the commented out import below)
#will need to be put into another .py and this should be run.

#import CRegressionData as rd

# Build the feature files; start=2 skips the header line of each input CSV.
dr = DataRewriter()
print("生成训练数据...\n")
dr.rewriteData(train=True , start=2 , header=True)

print("生成预测数据...\n")
dr.rewriteData(train=False , start=2 , header=True)

9.建模与预测

实际上在上述特征构造好了之后，可以用很多办法去训练得到模型和完成预测，这里用了sklearn中的SGDClassifier(logistic回归)。事实上xgboost有更好的效果；另外注意做交叉验证。

#建模与预测
from __future__ import division

import math

import numpy as np
import pandas as pd

from sklearn.cross_validation import KFold
from sklearn.linear_model import SGDClassifier

def train():
    """Fit a logistic-loss SGD classifier on the generated training features.

    Reads ``./recommendData/data_train_test.csv`` and uses the
    ``interested`` column (1 or 0) as the target.

    Returns:
        The fitted ``SGDClassifier``.
    """
    featureCols = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                   "user_pop", "frnd_infl", "evt_pop"]
    trainDf = pd.read_csv("./recommendData/data_train_test.csv")
    # Plain ndarrays rather than np.matrix: same values, and accepted by
    # estimators that reject the matrix subclass.
    X = trainDf[featureCols].values
    y = trainDf["interested"].values

    clf = SGDClassifier(loss="log", penalty="l2")
    clf.fit(X, y)
    return clf

def validate():
    """Run 10-fold cross-validation and print per-fold and mean accuracy.

    Reads ``./recommendData/data_train_test.csv`` and uses the
    ``interested`` column as the target.
    """
    featureCols = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                   "user_pop", "frnd_infl", "evt_pop"]
    trainDf = pd.read_csv("./recommendData/data_train_test.csv")
    X = trainDf[featureCols].values
    y = trainDf["interested"].values

    nrows = len(trainDf)
    # KFold is called with the (n, n_folds) signature of the
    # sklearn.cross_validation module this file imports it from, and the
    # resulting object is iterated directly.
    kfold = KFold(nrows, 10)
    avgAccuracy = 0
    run = 0
    # Index names chosen so the module-level train()/test() are not shadowed.
    for trainIdx, testIdx in kfold:
        clf = SGDClassifier(loss="log", penalty="l2")
        clf.fit(X[trainIdx], y[trainIdx])

        # Fraction of correctly predicted rows, computed in one call.
        accuracy = np.mean(clf.predict(X[testIdx]) == y[testIdx])

        print("accuracy (run %d): %f" % (run , accuracy))
        avgAccuracy += accuracy
        run += 1
    print("Average accuracy" , (avgAccuracy / run))
    
def test(clf):
    """Predict the test set with *clf* and write ``result.csv``.

    Args:
        clf: fitted classifier providing ``predict`` and
            ``decision_function``.

    The output has one line per row of ``data_test_test.csv`` with the
    columns user, event, outcome (predicted class) and dist (decision
    function value); user and event ids are taken from ``test_test.csv``.
    """
    origTestDf = pd.read_csv("./recommendData/test_test.csv")
    users = origTestDf.user
    events = origTestDf.event

    testDf = pd.read_csv("./recommendData/data_test_test.csv")
    nrows = len(testDf)
    Xp = testDf.values
    # Column 0: predicted class, column 1: decision-function value.
    yp = np.zeros((nrows, 2))
    if nrows:
        # Score all rows at once instead of one call per row.
        yp[:, 0] = clf.predict(Xp)
        yp[:, 1] = clf.decision_function(Xp)

    with open("./recommendData/result.csv", "w") as fout:
        fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
        for i in range(0, nrows):
            fout.write(",".join(map(str, [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")
    
# Train on the generated features, then write predictions for the test set.
# The cross-validation call is left disabled.
clf = train()
test(clf)
#validate()

10.生成要提交的文件

"""
#处理成提交结果的格式
from __future__ import division

import pandas as pd

def byDist(x , y):
    return int(y[1] - x[1])

def generate_submition_file():
    #输出文件
    fout = open("./recommendData/final_result.csv" , "w")
    fout.write(",".join(["User" , "Events"]) + "\n")
    resultDf = pd.read_csv("./recommendData/result.csv")
    #group remaining user/events
    grouped = resultDf.groupby("user")
    print(grouped)
    for name , group in grouped:
        print(group)
        user = str(name)
        print(list(group.event))
        tuples = zip(list(group.event) , list(group.dist) , list(group.outcome))
        tuples = sorted(tuples ,cmp=byDist)
        events = "\"" + str(map(lambda x: x[0] , tuples)) + "\""
        fout.write(",".join([user , events]) + "\n")
    fout.close()
    
generate_submition_file()
"""