minte9
LearnRemember





Count Vectorizer

How can we represent texts as vectors?
How can we use multiple features?


Vectors Similarity

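Any text can be represented as a vector of word counts. Here the word london occurs 2 times in A and 1 time in B, while paris occurs 1 time in A and 2 times in B, giving the vectors [2, 1] and [1, 2]. We can then compute the cosine similarity between these two vectors.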
 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from icecream import ic

# Two sample documents built from the same two words in different proportions
A = 'London Paris London'
B = 'Paris Paris London'

# Create an instance of CountVectorizer (default settings)
cv = CountVectorizer()

# Learn the vocabulary from both documents and convert them into a
# document-term frequency matrix: one row per document, one column per
# unique word. The result is a sparse matrix (see the ic(matrix) output).
matrix = cv.fit_transform([A, B])

# Calculate the pairwise cosine similarity between the document vectors.
# Since there are two documents, this results in a 2x2 matrix:
# - diagonal elements are the self-similarity scores (1.0)
# - off-diagonal elements are the cross-document similarity scores
# For [2, 1] and [1, 2]: (2*1 + 1*2) / (sqrt(5) * sqrt(5)) = 4/5 = 0.8
similarity_scores = cosine_similarity(matrix)

# ic() prints each expression together with its value
ic(cv.get_feature_names_out())  # vocabulary, in column order
ic(matrix)                      # sparse matrix summary
ic(matrix.toarray())            # dense word counts per document
ic(similarity_scores)

"""
    ic| cv.get_feature_names_out(): array(['london', 'paris'], dtype=object)
    ic| matrix: <2x2 sparse matrix of type '<class 'numpy.int64'>'
                    with 4 stored elements in Compressed Sparse Row format>
    ic| matrix.toarray(): array([[2, 1],
                                [1, 2]])
    ic| similarity_scores: array([[1. , 0.8],
                                [0.8, 1. ]])
"""

Combine Features

With scikit-learn's CountVectorizer we can represent texts as vectors. We combine the relevant features (genres, cast, keywords) into a single text per movie and compute the cosine similarity between every pair of movies. To recommend movies, we sort the other movies by their similarity score to the selected movie in descending order; the selected movie itself is excluded from the results.
 
""" Knn / Movie recommendation system (scikit)
"""

import pathlib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Resolve the dataset relative to this script, so loading does not depend on
# the current working directory.
DIR = pathlib.Path(__file__).resolve().parent
dataset_path = DIR / 'data/movies_dataset2.csv'
movies = pd.read_csv(dataset_path)

# New combined feature
def combine_features(row):
    """Join a movie's genres, cast and keywords into one text string.

    Missing values (None, NaN, pd.NA) are skipped. Passing them through
    str() would produce the literal text "nan", which the vectorizer would
    count as a word shared by every movie with a missing field and so
    inflate their similarity scores.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            'genres', 'cast' and 'keywords' entries.

    Returns:
        str: The present fields joined by single spaces, or '' when all
        three are missing.
    """
    parts = []
    for column in ('genres', 'cast', 'keywords'):
        value = row[column]
        # Only scalars can be "missing"; pd.isna on a list would return an array.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        parts.append(str(value))
    return " ".join(parts)

movies['combined_features'] = movies.apply(combine_features, axis=1)

# Similarity
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])
# Store the result under its own name: rebinding `cosine_similarity` would
# replace the imported function with an array and break any later call.
similarity_matrix = cosine_similarity(count_matrix)


# Find the most similar movies
def predict_movies(name, top_n=10):
    """Print the movies most similar to the first title containing `name`.

    Args:
        name (str): Text to look for in 'original_title'. Matched literally
            (not as a regular expression) and case-sensitively.
        top_n (int): Number of recommendations to print. Defaults to 10.

    Raises:
        IndexError: If no title contains `name`.
    """
    # na=False keeps missing titles from poisoning the boolean mask.
    title_matches = movies['original_title'].str.contains(name, regex=False, na=False)

    # Work with row positions: the similarity matrix is indexed by position,
    # not by DataFrame index label.
    positions = np.flatnonzero(title_matches.to_numpy())
    if positions.size == 0:
        raise IndexError(f"No movie title contains {name!r}")
    index = positions[0]

    scores = similarity_matrix[index]

    # Exclude the selected movie explicitly. Dropping the first sorted element
    # is wrong when another movie ties with it at a score of 1.0.
    candidates = (i for i in range(len(scores)) if i != index)

    # Sort by score in descending order
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    # Output results
    for movie_index in ranked[:top_n]:
        data = movies.iloc[movie_index]
        # f-string formatting also copes with a missing (NaN) genres value.
        print(f"{data['original_title']} | {data['genres']} | Rating: {data['vote_average']}")


predict_movies('Avatar')

"""
    Guardians of the Galaxy | Action Science Fiction Adventure | Rating: 7.9
    Star Trek Into Darkness | Action Adventure Science Fiction | Rating: 7.4
    Star Trek Beyond | Action Adventure Science Fiction | Rating: 6.6
    Alien | Horror Action Thriller Science Fiction | Rating: 7.9
    Star Wars: Clone Wars (Volume 1) | Action ... Fiction | Rating: 8.0
    Planet of the Apes | Thriller Science Fiction Action Adventure | Rating: 5.6
    Moonraker | Action Adventure Thriller Science Fiction | Rating: 5.9
    Galaxy Quest | Comedy Family Science Fiction | Rating: 6.9
    Gravity | Science Fiction Thriller Drama | Rating: 7.3
    Jupiter Ascending | Science Fiction Fantasy Action Adventure | Rating: 5.2
"""

Movies Similarity

Calculate how similar two movies are. The score is the sum of the cosine distances between their genres, cast and keywords vectors, so a lower score indicates greater similarity.
 
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

import pathlib
DIR = pathlib.Path(__file__).resolve().parent

# Read the preprocessed movies data from the pickle file stored next to this
# script and bind the deserialized object to `movies`.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
processed_path = DIR / 'data/movies_processed.pkl'
with processed_path.open('rb') as f:
    movies = pickle.load(f)

def similarity(a, b):
    """
    Score how far apart two movies are across genres, cast and keywords.

    Args:
    a (dict): Features of the first movie, read under the keys
        'genres_bin', 'cast_bin' and 'keywords_bin'.
    b (dict): Features of the second movie, read under the same keys.

    Returns:
    float: The sum of the three per-feature cosine distances. Despite the
        function name this is a distance: a lower score means more similar.
    """
    cosine = spatial.distance.cosine
    feature_keys = ('genres_bin', 'cast_bin', 'keywords_bin')

    # One cosine distance per feature vector, in a fixed order
    genre_gap, cast_gap, keyword_gap = [cosine(a[key], b[key]) for key in feature_keys]

    return genre_gap + cast_gap + keyword_gap

def predict_similar_movies(movie_title, top_n=10):
    """Print the movies closest to the first title containing `movie_title`.

    Closeness is measured with similarity(), where a lower score means a
    closer match.

    Args:
        movie_title (str): Text to look for in 'original_title'. Matched
            literally (not as a regular expression) and case-insensitively.
        top_n (int): Number of neighbours to print. Defaults to 10.

    Raises:
        IndexError: If no title contains `movie_title`.
    """
    # na=False keeps missing titles from breaking the boolean mask.
    title_matches = movies['original_title'].str.contains(
        movie_title, case=False, regex=False, na=False)
    if not title_matches.any():
        raise IndexError(f"No movie title contains {movie_title!r}")

    new_movie = movies[title_matches].iloc[0]
    print('\nSelected Movie: ', new_movie.original_title, "\n")

    # Keep only the feature vectors that similarity() reads
    new_movie_features = new_movie[['genres_bin', 'cast_bin', 'keywords_bin']]

    # Score every row against the selected movie (a row-by-row apply). The
    # selected movie gets +inf so it cannot rank among its own neighbours.
    distances = movies.apply(
        lambda x: similarity(new_movie_features, x)
        if x['new_id'] != new_movie['new_id'] else np.inf,
        axis=1)

    # The selected movie is already masked out, so take exactly top_n rows
    # (asking for one extra printed 11 movies instead of 10).
    nearest_neighbors = distances.nsmallest(top_n).index

    for neighbor_idx in nearest_neighbors:
        # Still reachable when the dataset has no more than top_n movies.
        if neighbor_idx == new_movie.name:
            continue
        neighbor = movies.loc[neighbor_idx]
        print(f"{neighbor['original_title']} | Genres: {neighbor['genres']} | Rating: {neighbor['vote_average']}")

predict_similar_movies("Avatar")
predict_similar_movies("Titanic")
predict_similar_movies("Star Trek")

"""
    Selected Movie:  Avatar 

    Star Trek Into Darkness | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.4
    Jupiter Ascending | Genres: ['ScienceFiction', 'Fantasy', 'Action', 'Adventure'] | Rating: 5.2
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Clash of the Titans | Genres: ['Adventure', 'Fantasy', 'Action'] | Rating: 5.6
    John Carter | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.1
    Pirates of the Caribbean: On Stranger Tides | Genres: ['Adventure', 'Action', 'Fantasy'] | Rating: 6.4
    The Fifth Element | Genres: ['Adventure', 'Fantasy', 'Action', 'Thriller', 'ScienceFiction'] | Rating: 7.3
    The Time Machine | Genres: ['ScienceFiction', 'Adventure', 'Action'] | Rating: 5.8
    Superman Returns | Genres: ['Adventure', 'Fantasy', 'Action', 'ScienceFiction'] | Rating: 5.4
    Man of Steel | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 6.5

    Selected Movie:  Titanic 

    Revolutionary Road | Genres: ['Drama', 'Romance'] | Rating: 6.7
    The Great Gatsby | Genres: ['Drama', 'Romance'] | Rating: 7.3
    Romeo + Juliet | Genres: ['Drama', 'Romance'] | Rating: 6.7
    Iris | Genres: ['Drama', 'Romance'] | Rating: 6.2
    The Beach | Genres: ['Drama', 'Adventure', 'Romance', 'Thriller'] | Rating: 6.3
    All the King's Men | Genres: ['Drama', 'Thriller'] | Rating: 5.7
    The Reader | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Sense and Sensibility | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Little Children | Genres: ['Romance', 'Drama'] | Rating: 6.9
    What's Eating Gilbert Grape | Genres: ['Romance', 'Drama'] | Rating: 7.5

    Selected Movie:  Star Trek Into Darkness 

    Star Trek Beyond | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.6
    Star Trek | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 7.4
    Avatar | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 7.2
    Transformers: Age of Extinction | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 5.8
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Captain America: Civil War | Genres: ['Adventure', 'Action', 'ScienceFiction'] | Rating: 7.1
    Oblivion | Genres: ['Action', 'ScienceFiction', 'Adventure', 'Mystery'] | Rating: 6.4
    Pacific Rim | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 6.7
    Avengers: Age of Ultron | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.3
    Riddick | Genres: ['ScienceFiction', 'Action', 'Thriller'] | Rating: 6.2
"""



  Last update: 301 days ago