thatanalyst.net Home Recommender System
Post
Cancel

Recommender System

Language :

INTRODUCTION

I built a movie recommendation system and deployed the end-to-end product on Heroku; it calls the TMDB API to fetch posters for the recommended movies.


1
2
3
4
import pandas as pd
import ast
from numpy.core.numeric import NaN
import requests


1
2
# Load the movie metadata from 'tmdb_5000_movies.csv' in the working directory.
moviesComplete = pd.read_csv('tmdb_5000_movies.csv')
# Show the column names of the loaded frame.
print(moviesComplete.columns)

The dataset contains 4803 movies, with the following columns:

1
2
3
4
5
6
# Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
#        'original_title', 'overview', 'popularity', 'production_companies',
#        'production_countries', 'release_date', 'revenue', 'runtime',
#        'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
#        'vote_count'],
#        dtype='object')


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Output = -1 when no poster is available for the movie

def getPoster(movieId, timeout=10):
    """Return the TMDB poster URL for ``movieId``, or -1 if none is available.

    Args:
        movieId: TMDB movie id, substituted into the movie-details URL.
        timeout: Seconds to wait for the TMDB API before giving up.

    Returns:
        The full ``https://image.tmdb.org/t/p/w500...`` poster URL as a string,
        or the sentinel ``-1`` when the request fails, the API reports
        ``success == False``, or the response has no usable ``poster_path``.
    """
    url_base = 'https://api.themoviedb.org/3/movie/{}?api_key=[API_KEY]&language=en-US'
    url = url_base.format(movieId)
    try:
        # Without a timeout a single stalled connection would block the
        # whole per-movie lookup loop indefinitely.
        response = requests.get(url, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON: report
        # "no poster" with the same sentinel as every other failure.
        return -1

    if not isinstance(data, dict):
        return -1
    # TMDB error payloads carry "success": false.
    if data.get('success') is False:
        return -1

    poster_path = data.get('poster_path')
    # poster_path may be absent, null or an empty string.
    if not isinstance(poster_path, str) or not poster_path:
        return -1
    return 'https://image.tmdb.org/t/p/w500' + poster_path

# Demonstration:
# Look up the poster for TMDB movie id 12.
getPoster(12)
''' Output: 
    https://image.tmdb.org/t/p/w500/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg 
'''


1
2
3
4
5
6
7
8
9
10
11
# Fetch a poster URL for every movie; getPoster returns -1 when there is none.
# Applying over the 'id' column avoids building a full row object per movie.
moviesComplete['poster'] = moviesComplete['id'].apply(getPoster)
# Drop the movies for which no poster could be found.
moviesComplete = moviesComplete[moviesComplete['poster'] != -1]
# Take an explicit copy: the columns assigned on ``movies`` from here on must
# land in an independent frame rather than in a slice of moviesComplete, which
# would trigger pandas' SettingWithCopyWarning.
movies = moviesComplete[['id', 'title', 'genres', 'vote_average', 'overview', 'poster']].copy()
# Keep the original overview text in 'over'; 'overview' itself is later
# replaced by its tokenized and stemmed form.
movies['over'] = movies['overview']
movies.head(1)
# Output:

'''
        id  	title	genres	                                                vote_average	overview	                                        poster	                                                       over
0	19995	Avatar	[{"id": 28, "name": "Action"}, {"id": 12, "nam...	7.2	        In the 22nd century, a paraplegic Marine is di...	https://image.tmdb.org/t/p/w500/jRXYjXNq0Cs2Tc...	       In the 22nd century, a paraplegic Marine is di...
'''


1
2
3
4
5
6
7
8
9
10
11
12
def strToList(var):
  """Extract the 'name' of every entry in a stringified list of dicts.

  ``var`` is text such as '[{"id": 28, "name": "Action"}]'. It is parsed with
  ``ast.literal_eval`` and the names are returned in their original order.
  """
  return [entry['name'] for entry in ast.literal_eval(var)]
# Parse every row's 'genres' string into a plain list of genre names.
genresList = movies['genres'].apply(strToList)

# Demonstration:
# This function converts - 
''' [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}] '''
# To - 
''' ['Action', 'Adventure', 'Fantasy', 'Science Fiction'] '''


1
2
3
4
5
6
7
8
# Replace the raw 'genres' strings with the parsed lists of genre names.
movies.loc[:, 'genres'] = genresList
# Preview the first row.
movies.head(1)
# Output:

'''
        id  	title	genres	                                                vote_average	overview	                                        poster	                                                       over
0	19995	Avatar	[Action, Adventure, Fantasy, Science Fiction]	        7.2             In the 22nd century, a paraplegic Marine is di...	https://image.tmdb.org/t/p/w500/jRXYjXNq0Cs2Tc...	       In the 22nd century, a paraplegic Marine is di...
'''


1
2
import nltk
# Download the NLTK 'stopwords' corpus, which stopwords.words('english')
# reads further down.
nltk.download('stopwords')


1
2
3
4
5
6
from nltk.tokenize import RegexpTokenizer

def removePunc(text):
  """Lower-case ``text`` and split it into word tokens, dropping punctuation.

  Args:
    text: The raw text. Non-string values (for example the float NaN that
      pandas uses for an empty CSV cell) are treated as empty text.

  Returns:
    A list of lower-case tokens made of runs of word characters; an empty
    list when ``text`` is not a string.
  """
  # A missing overview is not a str and has no .lower(); yield no tokens
  # instead of raising AttributeError in the middle of a column-wide apply.
  if not isinstance(text, str):
    return []
  tokenizer = RegexpTokenizer(r'\w+')
  return tokenizer.tokenize(text.lower())


1
2
3
4
5
6
7
from nltk.corpus import stopwords
from collections import Counter
# English stop words from the NLTK corpus.
stop_words = stopwords.words('english')
# Counter is a dict subclass, so membership tests against it are hash lookups
# rather than scans of the stop_words list.
stopwords_dict = Counter(stop_words)

def removeStop(wordList):
  """Return the words of ``wordList``, in order, that are not stop words."""
  kept = []
  for token in wordList:
    if token in stopwords_dict:
      continue
    kept.append(token)
  return kept


1
2
3
4
5
6
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize 
# Single stemmer instance shared by every stemWords call.
ps = PorterStemmer()

def stemWords(wordList):
  """Return the stem of each word in ``wordList``, preserving order."""
  stems = []
  for token in wordList:
    stems.append(ps.stem(token))
  return stems


1
2
3
4
5
6
7
8
9
def puncStopStem(text):
  """Tokenize ``text``, drop stop words, stem, and re-join with spaces."""
  tokens = removePunc(text)
  content_words = removeStop(tokens)
  stems = stemWords(content_words)
  return ' '.join(stems)
# Overwrite 'overview' with its tokenized, stop-word-free, stemmed form.
movies.loc[:, 'overview'] = movies['overview'].apply(puncStopStem)

'''
In the 22nd century, a paraplegic Marine is di...
# To
22nd centuri parapleg marin dispatch moon pand...
'''	


1
2
3
4
5
6
7
8
9
10
11
12
13
# Vectorization: turn each processed overview into a vector of term counts.
from sklearn.feature_extraction.text import CountVectorizer
# max_features caps the vocabulary size at 5000 terms.
cv = CountVectorizer(max_features = 5000)
# Fit on the processed overviews and convert the result to a dense array.
vectors = cv.fit_transform(movies['overview']).toarray()

''' Example:
['advanc',
 'advantag',
 'adventur',
 'advertis',
 'advic',
 'advis',...]
 '''


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the overview vectors of all movies.
similarity = cosine_similarity(vectors)
# Only ``import pandas as pd`` is in scope, so the bare name ``DataFrame``
# would raise NameError; reach the constructor through ``pd``.
simdf = pd.DataFrame(similarity)
print(simdf.shape)
simdf.head()

'''
(4734, 4734)
0	1	        2	        3	    4	        5         	6	        7       	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	...	4694	4695	4696	4697	4698	4699	4700	4701	4702	4703	4704	4705	4706	4707	4708	4709	4710	4711	4712	4713	4714	4715	4716	4717	4718	4719	4720	4721	4722	4723	4724	4725	4726	4727	4728	4729	4730	4731	4732	4733
0	1.000000	0.000000	0.00000	    0.105409	0.046374	0.043033	0.034816	0.045644	0.000000	0.000000	0.000000	0.043644	0.000000	0.000000	0.052705	0.000000	0.000000	0.000000	0.032026	0.000000	0.029424	0.000000	0.000000	0.00000	0.000000	0.000000	0.047946	0.105409	0.000000	0.036155	0.000000	0.074536	0.000000	0.000000	0.0	0.000000	0.078750	0.000000	0.031083	0.000000	...	0.000000	0.000000	0.00000	0.0	0.0	0.000000	0.000000	0.0	0.000000	0.0	0.000000	0.0	0.044281	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.00	0.000000	0.038069	0.000000	0.000000	0.031311	0.043644	0.00000	0.0	0.0	0.000000	0.000000	0.000000	0.069007	0.045644	0.000000	0.0	0.0	0.036886	0.000000
1	0.000000	1.000000	0.05164	    0.000000	0.046374	0.000000	0.000000	0.045644	0.000000	0.000000	0.000000	0.000000	0.071611	0.035466	0.052705	0.040324	0.049690	0.040324	0.128103	0.040324	0.029424	0.000000	0.062622	0.00000	0.000000	0.029050	0.047946	0.035136	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.133333	0.039375	0.000000	0.062167	0.000000	...	0.000000	0.023281	0.00000	0.2	0.0	0.079894	0.000000	0.0	0.045644	0.0	0.000000	0.0	0.000000	0.0	0.0	0.087287	0.059235	0.093250	0.092748	0.095893	0.00	0.074536	0.076139	0.028172	0.064550	0.031311	0.000000	0.00000	0.0	0.0	0.059235	0.046374	0.000000	0.034503	0.000000	0.000000	0.0	0.0	0.036886	0.000000
2	0.000000	0.051640	1.00000	    0.000000	0.000000	0.033333	0.000000	0.035355	0.044721	0.032026	0.000000	0.270449	0.000000	0.027472	0.000000	0.000000	0.038490	0.093704	0.124035	0.000000	0.000000	0.000000	0.048507	0.04714	0.000000	0.000000	0.037139	0.027217	0.000000	0.196039	0.000000	0.000000	0.000000	0.050000	0.0	0.154919	0.030500	0.000000	0.072232	0.000000	...	0.000000	0.000000	0.00000	0.0	0.0	0.020628	0.000000	0.0	0.106066	0.0	0.000000	0.0	0.034300	0.0	0.0	0.000000	0.000000	0.024077	0.035921	0.000000	0.05	0.000000	0.029488	0.021822	0.000000	0.024254	0.033806	0.04264	0.0	0.0	0.000000	0.000000	0.106904	0.000000	0.000000	0.022942	0.0	0.0	0.028571	0.000000
3	0.105409	0.000000	0.00000	    1.000000	0.024441	0.045361	0.073398	0.024056	0.060858	0.152535	0.024845	0.000000	0.000000	0.000000	0.000000	0.042505	0.026189	0.000000	0.016879	0.021253	0.015508	0.027778	0.000000	0.00000	0.087039	0.076553	0.025270	0.000000	0.032075	0.057166	0.046676	0.039284	0.057735	0.000000	0.0	0.000000	0.041505	0.030429	0.081912	0.025717	...	0.023689	0.049081	0.03637	0.0	0.0	0.014036	0.045361	0.0	0.072169	0.0	0.054433	0.0	0.070014	0.0	0.0	0.023002	0.000000	0.016382	0.024441	0.000000	0.00	0.039284	0.000000	0.044544	0.000000	0.099015	0.000000	0.00000	0.0	0.0	0.000000	0.024441	0.000000	0.054554	0.000000	0.000000	0.0	0.0	0.038881	0.086066
4	0.046374	0.046374	0.00000     0.024441	1.000000	0.000000	0.024218	0.063500	0.000000	0.057520	0.032791	0.000000	0.049814
'''


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# NumPy array of the similarity values before any weighting is applied.
numcheck = simdf.to_numpy()
# Build the weights: each movie's vote_average squared, as a plain list.
votes = movies[['vote_average']]
multi = votes**2
multi = list(multi['vote_average'])
# NOTE(review): multiplying a DataFrame by a list broadcasts it across the
# columns, so column j (the candidate movie) is scaled by multi[j] - confirm.
simdf = simdf*multi
# Weighted scores as a NumPy array; recoRating reads rows of this matrix.
similarity2 = simdf.to_numpy()


def recoRating(movieIndex):
  """Recommend up to five movies for the movie at row position ``movieIndex``.

  Candidates are the 199 best-scoring other movies in row ``movieIndex`` of
  the module-level ``similarity2`` matrix. A candidate is preferred when it
  carries all but at most one of the query movie's genres. If fewer than five
  such candidates exist, the five best-scoring candidates are used instead.

  Args:
    movieIndex: Positional (``iloc``) index of the query movie in ``movies``
      and row index into ``similarity2``.

  Returns:
    A list of ``[index, title, id, overview, poster]`` lists, best score
    first. It holds five entries unless fewer candidates are available.
  """
  ranked = sorted(enumerate(similarity2[movieIndex]),
                  key=lambda pair: pair[1], reverse=True)
  # Remove the query movie by index rather than by rank: the scores are
  # scaled by each candidate's squared vote average, so the query movie is
  # not guaranteed to come first, and dropping "rank 0" could discard the
  # best recommendation while keeping the query movie itself.
  candidates = [pair for pair in ranked if pair[0] != movieIndex][:199]

  # Select the column by label; integer keys on a labelled row are
  # deprecated as positional lookups.
  mainGenre = movies.iloc[movieIndex]['genres']

  same_genre = []
  for pair in candidates:
    compareGenre = set(movies.iloc[pair[0]]['genres'])
    missing = sum(1 for genre in mainGenre if genre not in compareGenre)
    # Accept candidates lacking at most one of the query movie's genres.
    if missing <= 1:
      same_genre.append(pair)
      if len(same_genre) == 5:
        break

  if len(same_genre) < 5:
    # Not enough genre matches: fall back to the overall best candidates,
    # starting from the very best one.
    same_genre = candidates[:5]

  topList = []
  for index, _score in same_genre:
    row = movies.iloc[index]
    topList.append([index, row['title'], row['id'], row['overview'], row['poster']])
  return topList


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Recommend movies for the movie at row position 423 of ``movies``.
recoRating(423)

# Demonstration:
'''
[[4221,
  'Road Hard',
  291362,
  'https://image.tmdb.org/t/p/w500/aClpHJtwN2I6DmCZpu54LUVoQJK.jpg'],
 [809,
  'Forrest Gump',
  13,
  'https://image.tmdb.org/t/p/w500/saHP97rTPS5eLmrLQEcANmKrsFl.jpg'],
 [2654,
  'Groundhog Day',
  137,
  'https://image.tmdb.org/t/p/w500/gCgt1WARPZaXnq523ySQEUKinCs.jpg'],
 [4226,
  'Me You and Five Bucks',
  361505,
  'https://image.tmdb.org/t/p/w500/ukQf5WWnLerLsMR1bo33HmQ6EUG.jpg'],
 [2602,
  'Brazil',
  68,
  'https://image.tmdb.org/t/p/w500/d0PibPzCK4fVikjoD1PqHovbvkt.jpg']]
'''


1


1


Please give the webapp a few seconds to run on Heroku servers. Thank you for your patience.

This post is licensed under CC BY 4.0 by the author.