Supervised ml Count Vectorizer

Count Vectorizer

We represent texts as vectors using count vectorizer.
We combine features and compute similarity score.

How can we represent texts as vectors?
How can we use multiple features?

Vectors Similarity

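Any text can be represented as a vector of word counts. The word "london" occurs 2 times in A and 1 time in B, while "paris" occurs 1 time in A and 2 times in B, so A = [2, 1] and B = [1, 2]. The cosine similarity between these two vectors is (2·1 + 1·2) / (√5 · √5) = 0.8.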

 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from icecream import ic

# Two sample documents built from the same two words in different proportions
A = 'London Paris London'
B = 'Paris Paris London'

# Create an instance of CountVectorizer with its default settings
# (the output below shows the tokens lowercased: 'london', 'paris')
cv = CountVectorizer()

# Learn the vocabulary and convert the text strings into a count matrix:
# one row per document, one column per unique word.
# Per the output below: A -> [2, 1] and B -> [1, 2].
matrix = cv.fit_transform([A, B])

# Calculate the pairwise cosine similarity between the rows of the matrix.
# Since there are two documents, this results in a 2x2 matrix.
# Diagonal elements are the self-similarity scores (1.0).
# Off-diagonal elements are the A-vs-B score:
# (2*1 + 1*2) / (sqrt(5) * sqrt(5)) = 0.8
similarity_scores = cosine_similarity(matrix)

# Print each expression together with its value
ic(cv.get_feature_names_out())
ic(matrix)
ic(matrix.toarray())
ic(similarity_scores)

"""
    ic| cv.get_feature_names_out(): array(['london', 'paris'], dtype=object)
    ic| matrix: <2x2 sparse matrix of type '<class 'numpy.int64'>'
                    with 4 stored elements in Compressed Sparse Row format>
    ic| matrix.toarray(): array([[2, 1],
                                [1, 2]])
    ic| similarity_scores: array([[1. , 0.8],
                                [0.8, 1. ]])
"""

Combine Features

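With scikit-learn's CountVectorizer we can represent texts as vectors. We combine the relevant features (genres, cast and keywords) into a single string per movie and compute the cosine similarity score between movies. We sort the (index, score) pairs by the similarity score x[1] in descending order. The selected movie itself, which scores 1.0 against itself and therefore ranks at the top, is excluded from the results.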

 
""" Knn / Movie recommendation system (scikit)
"""

import pathlib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Resolve the dataset relative to this script's directory, so the path does
# not depend on the current working directory
DIR = pathlib.Path(__file__).resolve().parent
movies = pd.read_csv(DIR / 'data/movies_dataset2.csv')

# New combined feature
def combine_features(row):
    """
    Join the genres, cast and keywords of a row into one string.

    Every value is converted with str() first, so non-string values are
    included as their string form. The parts are separated by single spaces.
    """
    columns = ('genres', 'cast', 'keywords')
    return " ".join(str(row[column]) for column in columns)

movies['combined_features'] = movies.apply(combine_features, axis=1)

# Similarity
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])
# Stored under its own name: rebinding `cosine_similarity` to the result
# would shadow the imported function and make it uncallable afterwards.
similarity_matrix = cosine_similarity(count_matrix)

# Find the 10 most similar movies
def predict_movies(name, top_n=10):
    """
    Print the movies most similar to the first movie whose title matches name.

    Args:
    name (str): Pattern passed to Series.str.contains on 'original_title'.
    top_n (int): Number of similar movies to print (default 10).

    Raises:
    IndexError: If no title matches name.
    """
    # na=False: a missing title counts as "no match" instead of putting NaN
    # into the boolean mask
    title_matches = movies['original_title'].str.contains(name, na=False)

    # Row positions (not index labels), because the similarity matrix and
    # movies.iloc are both addressed by position
    positions = np.flatnonzero(title_matches.to_numpy(dtype=bool))
    if positions.size == 0:
        raise IndexError(f"No movie title matches {name!r}")
    selected = positions[0]

    # Pair every other movie's position with its score against the selected
    # movie. The selected movie is left out by position rather than by
    # dropping the top-ranked entry, which could be a different movie when
    # another movie ties with it at 1.0.
    similar_movies = [
        (position, score)
        for position, score in enumerate(similarity_matrix[selected])
        if position != selected
    ]

    # Sort by score in descending order
    similar_sorted = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    # Output results
    for movie_index, _ in similar_sorted[:top_n]:
        data = movies.iloc[movie_index]

        # An f-string also copes with non-string values (e.g. missing genres)
        print(f"{data['original_title']} | {data['genres']} | Rating: {data['vote_average']}")

predict_movies('Avatar')

"""
    Guardians of the Galaxy | Action Science Fiction Adventure | Rating: 7.9
    Star Trek Into Darkness | Action Adventure Science Fiction | Rating: 7.4
    Star Trek Beyond | Action Adventure Science Fiction | Rating: 6.6
    Alien | Horror Action Thriller Science Fiction | Rating: 7.9
    Star Wars: Clone Wars (Volume 1) | Action ... Fiction | Rating: 8.0
    Planet of the Apes | Thriller Science Fiction Action Adventure | Rating: 5.6
    Moonraker | Action Adventure Thriller Science Fiction | Rating: 5.9
    Galaxy Quest | Comedy Family Science Fiction | Rating: 6.9
    Gravity | Science Fiction Thriller Drama | Rating: 7.3
    Jupiter Ascending | Science Fiction Fantasy Action Adventure | Rating: 5.2
"""

Movies Similarity

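Calculate how far apart two movies are, as the sum of the cosine distances between their genres, cast and keywords vectors. Because the score is a distance, a lower score indicates greater similarity.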

 
import numpy as np
import pandas as pd
import pickle
from scipy import spatial

import pathlib
# Directory of this script; the data file is resolved relative to it
DIR = pathlib.Path(__file__).resolve().parent

# Load the preprocessed data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(DIR / 'data/movies_processed.pkl', 'rb') as f:

    # Deserialize the data from the file and assign it to variable
    movies = pickle.load(f)

def similarity(a, b):
    """
    Score how far apart two movies are.

    Args:
    a: First movie, indexable by 'genres_bin', 'cast_bin' and 'keywords_bin'.
    b: Second movie, indexable by the same three keys.

    Returns:
    The sum of the cosine distances between the genres, cast and keywords
    vectors of the two movies; a lower score indicates greater similarity.
    """
    feature_keys = ('genres_bin', 'cast_bin', 'keywords_bin')

    # One cosine distance per feature, in the order genres, cast, keywords
    genres_dist, cast_dist, keywords_dist = [
        spatial.distance.cosine(a[key], b[key]) for key in feature_keys
    ]

    # The overall score is the plain sum of the three distances
    return genres_dist + cast_dist + keywords_dist

def predict_similar_movies(movie_title, top_n=10):
    """
    Print the movies closest to the first movie whose title contains movie_title.

    Args:
    movie_title (str): Text searched for in 'original_title'
        (case-insensitive, literal substring match).
    top_n (int): Number of similar movies to print (default 10).

    Raises:
    IndexError: If no title contains movie_title.
    """
    # na=False: a missing title counts as "no match" instead of putting NaN
    # into the boolean mask
    title_matches = movies['original_title'].str.contains(
        movie_title, case=False, regex=False, na=False)
    matches = movies[title_matches]
    if matches.empty:
        raise IndexError(f"No movie title contains {movie_title!r}")
    new_movie = matches.iloc[0]
    print('\nSelected Movie: ', new_movie.original_title, "\n")

    # Keep only the feature columns that similarity() reads
    new_movie_series = new_movie[['genres_bin', 'cast_bin', 'keywords_bin']]

    # Distance from the selected movie to every row: a row-wise apply with
    # one similarity() call per movie. The selected movie itself gets an
    # infinite distance so that it sorts after every other movie.
    distances = movies.apply(lambda x: similarity(new_movie_series, x)
                             if x['new_id'] != new_movie['new_id'] else np.inf, axis=1)

    # Get the top_n most similar movies. The selected movie is already pushed
    # to the end by its infinite distance, so requesting top_n + 1 rows would
    # print one movie too many.
    nearest_neighbors = distances.nsmallest(top_n).index

    for neighbor_idx in nearest_neighbors:
        # The selected movie can only show up here when there are fewer than
        # top_n other movies; skip it in that case
        if neighbor_idx != new_movie.name:
            neighbor = movies.loc[neighbor_idx]
            print(f"{neighbor['original_title']} | Genres: {neighbor['genres']} | Rating: {neighbor['vote_average']}")

predict_similar_movies("Avatar")
predict_similar_movies("Titanic")
predict_similar_movies("Star Trek")

"""
    Selected Movie:  Avatar 

    Star Trek Into Darkness | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.4
    Jupiter Ascending | Genres: ['ScienceFiction', 'Fantasy', 'Action', 'Adventure'] | Rating: 5.2
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Clash of the Titans | Genres: ['Adventure', 'Fantasy', 'Action'] | Rating: 5.6
    John Carter | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.1
    Pirates of the Caribbean: On Stranger Tides | Genres: ['Adventure', 'Action', 'Fantasy'] | Rating: 6.4
    The Fifth Element | Genres: ['Adventure', 'Fantasy', 'Action', 'Thriller', 'ScienceFiction'] | Rating: 7.3
    The Time Machine | Genres: ['ScienceFiction', 'Adventure', 'Action'] | Rating: 5.8
    Superman Returns | Genres: ['Adventure', 'Fantasy', 'Action', 'ScienceFiction'] | Rating: 5.4
    Man of Steel | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 6.5

    Selected Movie:  Titanic 

    Revolutionary Road | Genres: ['Drama', 'Romance'] | Rating: 6.7
    The Great Gatsby | Genres: ['Drama', 'Romance'] | Rating: 7.3
    Romeo + Juliet | Genres: ['Drama', 'Romance'] | Rating: 6.7
    Iris | Genres: ['Drama', 'Romance'] | Rating: 6.2
    The Beach | Genres: ['Drama', 'Adventure', 'Romance', 'Thriller'] | Rating: 6.3
    All the King's Men | Genres: ['Drama', 'Thriller'] | Rating: 5.7
    The Reader | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Sense and Sensibility | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Little Children | Genres: ['Romance', 'Drama'] | Rating: 6.9
    What's Eating Gilbert Grape | Genres: ['Romance', 'Drama'] | Rating: 7.5

    Selected Movie:  Star Trek Into Darkness 

    Star Trek Beyond | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.6
    Star Trek | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 7.4
    Avatar | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 7.2
    Transformers: Age of Extinction | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 5.8
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Captain America: Civil War | Genres: ['Adventure', 'Action', 'ScienceFiction'] | Rating: 7.1
    Oblivion | Genres: ['Action', 'ScienceFiction', 'Adventure', 'Mystery'] | Rating: 6.4
    Pacific Rim | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 6.7
    Avengers: Age of Ultron | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.3
    Riddick | Genres: ['ScienceFiction', 'Action', 'Thriller'] | Rating: 6.2
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from icecream import ic

# Two sample documents built from the same two words in different proportions
A = 'London Paris London'
B = 'Paris Paris London'

# Create an instance of CountVectorizer with its default settings
# (the output below shows the tokens lowercased: 'london', 'paris')
cv = CountVectorizer()

# Learn the vocabulary and convert the text strings into a count matrix:
# one row per document, one column per unique word.
# Per the output below: A -> [2, 1] and B -> [1, 2].
matrix = cv.fit_transform([A, B])

# Calculate the pairwise cosine similarity between the rows of the matrix.
# Since there are two documents, this results in a 2x2 matrix.
# Diagonal elements are the self-similarity scores (1.0).
# Off-diagonal elements are the A-vs-B score:
# (2*1 + 1*2) / (sqrt(5) * sqrt(5)) = 0.8
similarity_scores = cosine_similarity(matrix)

# Print each expression together with its value
ic(cv.get_feature_names_out())
ic(matrix)
ic(matrix.toarray())
ic(similarity_scores)

"""
    ic| cv.get_feature_names_out(): array(['london', 'paris'], dtype=object)
    ic| matrix: <2x2 sparse matrix of type '<class 'numpy.int64'>'
                    with 4 stored elements in Compressed Sparse Row format>
    ic| matrix.toarray(): array([[2, 1],
                                [1, 2]])
    ic| similarity_scores: array([[1. , 0.8],
                                [0.8, 1. ]])
"""

""" Knn / Movie recommendation system (scikit)
"""

import pathlib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Resolve the dataset relative to this script's directory, so the path does
# not depend on the current working directory
DIR = pathlib.Path(__file__).resolve().parent
movies = pd.read_csv(DIR / 'data/movies_dataset2.csv')

# New combined feature
def combine_features(row):
    """
    Join the genres, cast and keywords of a row into one string.

    Every value is converted with str() first, so non-string values are
    included as their string form. The parts are separated by single spaces.
    """
    columns = ('genres', 'cast', 'keywords')
    return " ".join(str(row[column]) for column in columns)

movies['combined_features'] = movies.apply(combine_features, axis=1)

# Similarity
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])
# Stored under its own name: rebinding `cosine_similarity` to the result
# would shadow the imported function and make it uncallable afterwards.
similarity_matrix = cosine_similarity(count_matrix)

# Find the 10 most similar movies
def predict_movies(name, top_n=10):
    """
    Print the movies most similar to the first movie whose title matches name.

    Args:
    name (str): Pattern passed to Series.str.contains on 'original_title'.
    top_n (int): Number of similar movies to print (default 10).

    Raises:
    IndexError: If no title matches name.
    """
    # na=False: a missing title counts as "no match" instead of putting NaN
    # into the boolean mask
    title_matches = movies['original_title'].str.contains(name, na=False)

    # Row positions (not index labels), because the similarity matrix and
    # movies.iloc are both addressed by position
    positions = np.flatnonzero(title_matches.to_numpy(dtype=bool))
    if positions.size == 0:
        raise IndexError(f"No movie title matches {name!r}")
    selected = positions[0]

    # Pair every other movie's position with its score against the selected
    # movie. The selected movie is left out by position rather than by
    # dropping the top-ranked entry, which could be a different movie when
    # another movie ties with it at 1.0.
    similar_movies = [
        (position, score)
        for position, score in enumerate(similarity_matrix[selected])
        if position != selected
    ]

    # Sort by score in descending order
    similar_sorted = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    # Output results
    for movie_index, _ in similar_sorted[:top_n]:
        data = movies.iloc[movie_index]

        # An f-string also copes with non-string values (e.g. missing genres)
        print(f"{data['original_title']} | {data['genres']} | Rating: {data['vote_average']}")

predict_movies('Avatar')

import numpy as np
import pandas as pd
import pickle
from scipy import spatial

import pathlib
# Directory of this script; the data file is resolved relative to it
DIR = pathlib.Path(__file__).resolve().parent

# Load the preprocessed data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open(DIR / 'data/movies_processed.pkl', 'rb') as f:

    # Deserialize the data from the file and assign it to variable
    movies = pickle.load(f)

def similarity(a, b):
    """
    Score how far apart two movies are.

    Args:
    a: First movie, indexable by 'genres_bin', 'cast_bin' and 'keywords_bin'.
    b: Second movie, indexable by the same three keys.

    Returns:
    The sum of the cosine distances between the genres, cast and keywords
    vectors of the two movies; a lower score indicates greater similarity.
    """
    feature_keys = ('genres_bin', 'cast_bin', 'keywords_bin')

    # One cosine distance per feature, in the order genres, cast, keywords
    genres_dist, cast_dist, keywords_dist = [
        spatial.distance.cosine(a[key], b[key]) for key in feature_keys
    ]

    # The overall score is the plain sum of the three distances
    return genres_dist + cast_dist + keywords_dist

def predict_similar_movies(movie_title, top_n=10):
    """
    Print the movies closest to the first movie whose title contains movie_title.

    Args:
    movie_title (str): Text searched for in 'original_title'
        (case-insensitive, literal substring match).
    top_n (int): Number of similar movies to print (default 10).

    Raises:
    IndexError: If no title contains movie_title.
    """
    # na=False: a missing title counts as "no match" instead of putting NaN
    # into the boolean mask
    title_matches = movies['original_title'].str.contains(
        movie_title, case=False, regex=False, na=False)
    matches = movies[title_matches]
    if matches.empty:
        raise IndexError(f"No movie title contains {movie_title!r}")
    new_movie = matches.iloc[0]
    print('\nSelected Movie: ', new_movie.original_title, "\n")

    # Keep only the feature columns that similarity() reads
    new_movie_series = new_movie[['genres_bin', 'cast_bin', 'keywords_bin']]

    # Distance from the selected movie to every row: a row-wise apply with
    # one similarity() call per movie. The selected movie itself gets an
    # infinite distance so that it sorts after every other movie.
    distances = movies.apply(lambda x: similarity(new_movie_series, x)
                             if x['new_id'] != new_movie['new_id'] else np.inf, axis=1)

    # Get the top_n most similar movies. The selected movie is already pushed
    # to the end by its infinite distance, so requesting top_n + 1 rows would
    # print one movie too many.
    nearest_neighbors = distances.nsmallest(top_n).index

    for neighbor_idx in nearest_neighbors:
        # The selected movie can only show up here when there are fewer than
        # top_n other movies; skip it in that case
        if neighbor_idx != new_movie.name:
            neighbor = movies.loc[neighbor_idx]
            print(f"{neighbor['original_title']} | Genres: {neighbor['genres']} | Rating: {neighbor['vote_average']}")

predict_similar_movies("Avatar")
predict_similar_movies("Titanic")
predict_similar_movies("Star Trek")

"""
    Selected Movie:  Avatar

    Star Trek Into Darkness | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.4
    Jupiter Ascending | Genres: ['ScienceFiction', 'Fantasy', 'Action', 'Adventure'] | Rating: 5.2
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Clash of the Titans | Genres: ['Adventure', 'Fantasy', 'Action'] | Rating: 5.6
    John Carter | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.1
    Pirates of the Caribbean: On Stranger Tides | Genres: ['Adventure', 'Action', 'Fantasy'] | Rating: 6.4
    The Fifth Element | Genres: ['Adventure', 'Fantasy', 'Action', 'Thriller', 'ScienceFiction'] | Rating: 7.3
    The Time Machine | Genres: ['ScienceFiction', 'Adventure', 'Action'] | Rating: 5.8
    Superman Returns | Genres: ['Adventure', 'Fantasy', 'Action', 'ScienceFiction'] | Rating: 5.4
    Man of Steel | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 6.5

    Selected Movie:  Titanic

    Revolutionary Road | Genres: ['Drama', 'Romance'] | Rating: 6.7
    The Great Gatsby | Genres: ['Drama', 'Romance'] | Rating: 7.3
    Romeo + Juliet | Genres: ['Drama', 'Romance'] | Rating: 6.7
    Iris | Genres: ['Drama', 'Romance'] | Rating: 6.2
    The Beach | Genres: ['Drama', 'Adventure', 'Romance', 'Thriller'] | Rating: 6.3
    All the King's Men | Genres: ['Drama', 'Thriller'] | Rating: 5.7
    The Reader | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Sense and Sensibility | Genres: ['Drama', 'Romance'] | Rating: 7.2
    Little Children | Genres: ['Romance', 'Drama'] | Rating: 6.9
    What's Eating Gilbert Grape | Genres: ['Romance', 'Drama'] | Rating: 7.5

    Selected Movie:  Star Trek Into Darkness

    Star Trek Beyond | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 6.6
    Star Trek | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 7.4
    Avatar | Genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction'] | Rating: 7.2
    Transformers: Age of Extinction | Genres: ['ScienceFiction', 'Action', 'Adventure'] | Rating: 5.8
    Guardians of the Galaxy | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 7.9
    Captain America: Civil War | Genres: ['Adventure', 'Action', 'ScienceFiction'] | Rating: 7.1
    Oblivion | Genres: ['Action', 'ScienceFiction', 'Adventure', 'Mystery'] | Rating: 6.4
    Pacific Rim | Genres: ['Action', 'ScienceFiction', 'Adventure'] | Rating: 6.7
    Avengers: Age of Ultron | Genres: ['Action', 'Adventure', 'ScienceFiction'] | Rating: 7.3
    Riddick | Genres: ['ScienceFiction', 'Action', 'Thriller'] | Rating: 6.2
"""
 Last update: 124 days ago

minte9 LearnRemember

Count Vectorizer

Vectors Similarity

Combine Features

Movies Similarity

minte9
LearnRemember