数据挖掘之协同过滤(1)

发布时间:2021-12-04 付费文章:2.0元

 

数据挖掘之协同过滤(1)

 

import pandas as pd

# Load the ratings table: tab-separated, no header row, first three columns only.
# NOTE(review): the path literal is the article's placeholder ("dataset: see the
# bottom of this page"); point it at the real ratings file before running.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    '数据集见本页底',
    sep='\t',
    names=r_cols,
    usecols=range(3),
    encoding="ISO-8859-1",
)

# Load the movie table: pipe-separated, no header row, first two columns only.
# NOTE(review): same placeholder path as above; this must be the movie/title file.
m_cols = ['movie_id', 'title']
movies = pd.read_csv(
    '数据集见本页底',
    sep='|',
    names=m_cols,
    usecols=range(2),
    encoding="ISO-8859-1",
)

# Attach a title to every rating row. The two frames share only the
# 'movie_id' column, so that is the key the default merge joins on.
ratings = movies.merge(ratings)
ratings.head()
  movie_id title user_id rating
# Reshape the long ratings table into a user x title matrix: one row per
# user_id, one column per title, cells holding the rating (NaN where the
# user did not rate that title).
movieRatings = pd.pivot_table(
    ratings,
    values='rating',
    index=['user_id'],
    columns=['title'],
)
movieRatings.head()

# Pull out the single column of ratings for the reference movie.
starWarsRatings = movieRatings.loc[:, 'Star Wars (1977)']
starWarsRatings.head()
user_id
0    5.0
1    5.0
2    5.0
3    NaN
4    5.0
Name: Star Wars (1977), dtype: float64
# Correlate every title's rating column with the Star Wars column, then
# discard titles for which no correlation could be computed (NaN results).
similarMovies = movieRatings.corrwith(starWarsRatings).dropna()

# Wrap the resulting Series in a one-column DataFrame for a tabular preview.
df = pd.DataFrame(similarMovies)
df.head(10)
C:\Users\Frank\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\numpy\lib\function_base.py:2487: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
C:\Users\Frank\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\numpy\lib\function_base.py:2496: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)

 

# Display the correlations ordered from highest to lowest. The result is not
# assigned, so similarMovies itself keeps its original order.
similarMovies.sort_values(ascending=False)
title
No Escape (1994)                                                                     1.000000
Man of the Year (1995)                                                               1.000000
Hollow Reed (1996)                                                                   1.000000
Commandments (1997)                                                                  1.000000
Cosi (1996)                                                                          1.000000
Stripes (1981)                                                                       1.000000
Golden Earrings (1947)                                                               1.000000
Mondo (1996)                                                                         1.000000
Line King: Al Hirschfeld, The (1996)                                                 1.000000
Outlaw, The (1943)                                                                   1.000000
Hurricane Streets (1998)                                                             1.000000
Scarlet Letter, The (1926)                                                           1.000000
Safe Passage (1994)                                                                  1.000000
Good Man in Africa, A (1994)                                                         1.000000
Full Speed (1996)                                                                    1.000000
Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)    1.000000
Star Wars (1977)                                                                     1.000000
Ed's Next Move (1996)                                                                1.000000
Twisted (1996)                                                                       1.000000
Beans of Egypt, Maine, The (1994)                                                    1.000000
Last Time I Saw Paris, The (1954)                                                    1.000000
Maya Lin: A Strong Clear Vision (1994)                                               1.000000
Designated Mourner, The (1997)                                                       0.970725
Albino Alligator (1996)                                                              0.968496
Angel Baby (1995)                                                                    0.962250
Prisoner of the Mountains (Kavkazsky Plennik) (1996)                                 0.927173
Love in the Afternoon (1957)                                                         0.923381
'Til There Was You (1997)                                                            0.872872
A Chef in Love (1996)                                                                0.868599
Guantanamera (1994)                                                                  0.866025
                                                                                       ...   
Pushing Hands (1992)                                                                -1.000000
Lamerica (1994)                                                                     -1.000000
Year of the Horse (1997)                                                            -1.000000
Collectionneuse, La (1967)                                                          -1.000000
Dream Man (1995)                                                                    -1.000000
S.F.W. (1994)                                                                       -1.000000
Nightwatch (1997)                                                                   -1.000000
Squeeze (1996)                                                                      -1.000000
Glass Shield, The (1994)                                                            -1.000000
Slingshot, The (1993)                                                               -1.000000
Lover's Knot (1996)                                                                 -1.000000
Tough and Deadly (1995)                                                             -1.000000
Sliding Doors (1998)                                                                -1.000000
Show, The (1995)                                                                    -1.000000
Nil By Mouth (1997)                                                                 -1.000000
Fall (1997)                                                                         -1.000000
Sudden Manhattan (1996)                                                             -1.000000
Salut cousin! (1996)                                                                -1.000000
Neon Bible, The (1995)                                                              -1.000000
Crossfire (1947)                                                                    -1.000000
Love and Death on Long Island (1997)                                                -1.000000
For Ever Mozart (1996)                                                              -1.000000
Swept from the Sea (1997)                                                           -1.000000
Fille seule, La (A Single Girl) (1995)                                              -1.000000
American Dream (1990)                                                               -1.000000
Theodore Rex (1995)                                                                 -1.000000
I Like It Like That (1994)                                                          -1.000000
Two Deaths (1995)                                                                   -1.000000
Roseanna's Grave (For Roseanna) (1997)                                              -1.000000
Frankie Starlight (1995)                                                            -1.000000
dtype: float64

 

import numpy as np
# Per-title statistics: how many ratings each title received ('size') and the
# average of those ratings ('mean'). The result has two-level columns,
# ('rating', 'size') and ('rating', 'mean').
# The aggregations are given by name rather than as np.size / np.mean:
# recent pandas releases emit a FutureWarning when NumPy callables are passed
# to groupby.agg, while the string names produce the same column labels and
# values on both old and new versions.
movieStats = ratings.groupby('title').agg({'rating': ['size', 'mean']})
movieStats.head()
  rating
  size mean
title    

 

# Boolean mask over titles: keep only movies with at least 100 ratings, so
# correlations based on a handful of viewers are excluded.
popularMovies = movieStats['rating']['size'] >= 100

# Preview the 15 best-rated popular titles.
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]

# movieStats has two-level columns while the similarity frame has one level.
# Joining across different column depths only warned on old pandas but raises
# MergeError on current releases, so flatten the stats columns to plain tuple
# labels -- ('rating', 'size'), ('rating', 'mean') -- before joining. These
# are the same labels the old mixed-level join produced.
popularStats = movieStats[popularMovies].copy()
popularStats.columns = popularStats.columns.to_flat_index()

# Attach each popular title's correlation with Star Wars as 'similarity'.
df = popularStats.join(similarMovies.to_frame(name='similarity'))
C:\Users\Frank\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\tools\merge.py:536: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)
  warnings.warn(msg, UserWarning)
# Preview the first rows of the joined stats + similarity table.
df.head()
  (rating, size) (rating, mean) similarity
title      

 

# Show the 15 popular titles whose ratings correlate most strongly with
# Star Wars, highest similarity first.
df.sort_values(by='similarity', ascending=False).head(15)