import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Plain white background for the seaborn figure drawn later in the script.
sns.set_style('white')

# Column labels applied to the tab-separated ratings file.
columns = ['userid', 'item_id', 'rating', 'timestamp']
# Supply the labels at read time with header=None so the file's first line is
# kept as a data record rather than being parsed as a header and then
# overwritten by the labels above.
# NOTE(review): assumes u.data has no header row of its own -- confirm.
df = pd.read_csv('u.data', sep="\t", header=None, names=columns)

# Attach the movie title to every rating row through the shared item_id key.
movietitles = pd.read_csv('Movie_Id_Titles')
df = pd.merge(df, movietitles, on='item_id')
# Per-title summary, indexed by title:
#   rating            -> mean of the ratings given to the title
#   number_of_ratings -> how many ratings the title received
# Both columns come from a single grouped pass over df.
ratings = df.groupby('title')['rating'].agg(
    rating='mean',
    number_of_ratings='count',
)
# Plot mean rating against rating count for every title.
sns.jointplot(x="rating", y="number_of_ratings", data=ratings, alpha=0.5)
# Reshape the long ratings table into a grid with one row per userid and one
# column per title; each cell holds that user's rating for that title.
movie_matrix = pd.pivot_table(
    df,
    values="rating",
    index="userid",
    columns="title",
)
movie_matrix
# Notebook-style peek at the ten titles with the most ratings.
ratings.sort_values(by='number_of_ratings', ascending=False).head(10)
# Per-user rating columns for the two movies of interest.
starwars_user_ratings = movie_matrix['Star Wars (1977)']
liarliar_user_ratings = movie_matrix['Liar Liar (1997)']

# Correlate every title's rating column with the Star Wars column. This pass
# over the whole matrix is the costly step, so it is run exactly once and the
# result is reused below.
similar_to_starwars = movie_matrix.corrwith(starwars_user_ratings)

# One-column frame indexed by title; rows whose correlation is NaN are dropped.
corr_starwars = similar_to_starwars.to_frame('Correlation').dropna()
corr_starwars.head()

# Bring in each title's rating count; both frames are indexed by title.
corr_starwars = corr_starwars.join(ratings['number_of_ratings'])
corr_starwars
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

# 1. Define the column names for the ratings file
# 2. Read the tab-separated data, applying those names while reading.
#    header=None keeps the file's first line as a data record instead of
#    letting it be parsed as a header and then overwritten by the names.
#    NOTE(review): assumes u.data has no header row of its own -- confirm.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep="\t", header=None, names=columns)
df.head()

# 1. Read in the item_id to movie_title df
# 2. Merge it onto the ratings using the shared item_id column
movietitles = pd.read_csv('Movie_Id_Titles')
df = pd.merge(df, movietitles, on="item_id")
# Mean rating per title, held as a one-column frame indexed by title.
ratings = df.groupby('title')['rating'].mean().to_frame()
ratings.head()
# Add, for each title, the number of ratings it received.
ratings = ratings.assign(
    number_of_ratings=df.groupby('title')['rating'].count(),
)
ratings.head()
# 1. Reshape the ratings into a user_id x title matrix (like an Excel pivot table)
movie_matrix = pd.pivot_table(
    df,
    values="rating",
    index="user_id",
    columns="title",
)
movie_matrix.head()
# 2. Choose the movies to study
# 3. Pull out the per-user rating column for each of them
starwars_ratings = movie_matrix.loc[:, 'Star Wars (1977)']
liarliar_ratings = movie_matrix.loc[:, 'Liar Liar (1997)']
# Correlate every title's column with Star Wars and keep the result as a
# one-column frame indexed by title.
similar_to_starwars = movie_matrix.corrwith(starwars_ratings)
corr_with_starwars = similar_to_starwars.to_frame('correlation')
corr_with_starwars
# Add rating counts, keep titles with at least 100 ratings, and rank them by
# correlation (highest first).
sw_df = (
    corr_with_starwars
    .join(ratings['number_of_ratings'])
    .loc[lambda frame: frame['number_of_ratings'] >= 100]
    .sort_values(by='correlation', ascending=False)
)
sw_df
df
# Every rating row that belongs to Liar Liar.
liar_liar = df.loc[df['title'].eq('Liar Liar (1997)')]
liar_liar
# user_id x title matrix of ratings.
df_matrix = pd.pivot_table(
    df,
    values="rating",
    index="user_id",
    columns="title",
)
df_matrix
# Correlate each title's column with the Star Wars column, then wrap the
# result in a one-column frame so more columns can be joined onto it.
df_m_sw = df_matrix.loc[:, 'Star Wars (1977)']
df_m_sw_corr = df_matrix.corrwith(df_m_sw)
df_m_sw_corr = df_m_sw_corr.to_frame('correlation')
df_m_sw_corr
# Join the per-title rating summary; both frames are indexed by title.
df_m_sw_corr_with_ratings = df_m_sw_corr.join(ratings)
df_m_sw_corr_with_ratings
# Keep only titles with at least 100 ratings.
df_m_sw_corr_with_ratings = df_m_sw_corr_with_ratings.loc[
    df_m_sw_corr_with_ratings['number_of_ratings'].ge(100)
]
# Display the remaining titles ranked by correlation (result is not stored).
df_m_sw_corr_with_ratings.sort_values(by='correlation', ascending=False)
# 1. Build the user_id x title matrix of ratings
df_matrix = pd.pivot_table(
    df,
    values="rating",
    index="user_id",
    columns="title",
)
# 2. Isolate the Liar Liar column
df_matrix_ll = df_matrix.loc[:, 'Liar Liar (1997)']
# 3. Correlate every column of the matrix with the Liar Liar column
df_matrix_ll_corr = df_matrix.corrwith(df_matrix_ll)
# 4. Wrap the result in a one-column df so things like 'ratings' can be added
df_matrix_ll_corr_df = df_matrix_ll_corr.to_frame('correlation')
# 5. Join the ratings summary (both dfs share the same index -- title)
df_matrix_ll_corr_df_with_ratings = df_matrix_ll_corr_df.join(ratings)
# 6. Keep only titles whose number of ratings is at least 100
df_matrix_ll_corr_df_with_ratings = df_matrix_ll_corr_df_with_ratings.loc[
    df_matrix_ll_corr_df_with_ratings['number_of_ratings'].ge(100)
]
# 7. Rank by correlation, highest first
df_matrix_ll_corr_df_with_ratings_sorted = (
    df_matrix_ll_corr_df_with_ratings.sort_values(by='correlation', ascending=False)
)
df_matrix_ll_corr_df_with_ratings_sorted
# IN SHORT: the Liar Liar pipeline in condensed form
dfm = pd.pivot_table(df, values="rating", index="user_id", columns="title")
ll = dfm.loc[:, 'Liar Liar (1997)']
# Correlate each title with Liar Liar and attach the per-title rating summary.
dfm_ll = dfm.corrwith(ll).to_frame('correlation').join(ratings)
# Keep titles with at least 100 ratings, ranked by correlation (highest first).
dfm_ll = (
    dfm_ll
    .loc[dfm_ll['number_of_ratings'].ge(100)]
    .sort_values(by='correlation', ascending=False)
)
dfm_ll