Objective#

Predict whether a review comes from a ‘detractor’, so that a restaurant owner can look up the relevant (negative) feedback and act on it. Following the Net Promoter Score convention, a reviewer who gives an overall score of 6 or lower is considered a detractor.

Initialize notebook#

Import dependencies#

import ast

import pandas as pd
import spacy

from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier

Set configuration#

# Make sure all columns are printed when printing a DataFrame.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 250)

Initialize spaCy model#

nlp = spacy.load("nl_core_news_lg")
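If the model has not been downloaded yet, spacy.load raises an OSError. A minimal self-healing variant of the cell above (a sketch using spaCy's bundled download helper) is:

try:
    nlp = spacy.load("nl_core_news_lg")
except OSError:
    # Model not installed yet: download it once, then load it.
    spacy.cli.download("nl_core_news_lg")
    nlp = spacy.load("nl_core_news_lg")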

Initialize data#

Read data from source#

data_source = "https://bhciaaablob.blob.core.windows.net/cmotionsnlpblogs/RestoReviewRawdata.csv"
raw_df = pd.read_csv(filepath_or_buffer=data_source,
                     decimal=",",
                     dtype={"restoId": pd.Int64Dtype(),
                            "restoName": str,
                            "tags": str,
                            "address": str,
                            "scoreTotal": float,
                            "avgPrice": str,
                            "numReviews": pd.Int64Dtype(),
                            "scoreFood": float,
                            "scoreService": float,
                            "scoreDecor": float,
                            "review_id": pd.Int64Dtype(),
                            "numreviews2": pd.Int64Dtype(),
                            "valueForPriceScore": str,
                            "noiseLevelScore": str,
                            "waitingTimeScore": str,
                            "reviewerId": str,
                            "reviewerFame": str,
                            "reviewerNumReviews": pd.Int64Dtype(),
                            "reviewDate": str,
                            "reviewScoreOverall": float,
                            "reviewScoreFood": float,
                            "reviewScoreService": float,
                            "reviewScoreAmbiance": float,
                            "reviewText": str})

Inspect raw data#

raw_df.info()
raw_df.sample(n=3, random_state=42)

Prepare data#

Create copy of the raw data#

reviews = raw_df.copy()

Drop irrelevant columns#

print(f"Shape before: {reviews.shape}")

reviews = reviews.drop(columns=['restoName', 'tags', 'address', 'review_id', 'reviewDate'])

print(f"Shape after: {reviews.shape}")

Drop rows with null values#

print(f"Shape before: {reviews.shape}")

reviews = reviews.dropna(axis='index')

print(f"Shape after: {reviews.shape}")

Drop invalid reviews#

def is_invalid_review(review):
    '''
    Return True for reviews that are still pending moderation ("Recensie is
    momenteel in behandeling") or are too short to be meaningful.
    '''
    return ("- Recensie is momenteel in behandeling -" in review) or len(review) <= 3


print(f"Shape before: {reviews.shape}")

reviews = reviews.drop(index=reviews[reviews['reviewText'].apply(is_invalid_review)].index)

print(f"Shape after: {reviews.shape}")

Convert reviewText from “bytes” to “string”#

def bytes_to_string(byte_repr):
    '''
    The raw data stores each review as the textual representation of a bytes
    literal (e.g. "b'tekst'"). Parse that literal safely with ast.literal_eval
    (instead of eval) and decode it to a UTF-8 string.
    '''
    return ast.literal_eval(byte_repr).decode('utf-8')


reviews['reviewText'] = reviews['reviewText'].apply(bytes_to_string)
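For illustration, a made-up raw value and its conversion (hypothetical input, not taken from the dataset):

# Hypothetical input: the column stores the repr of a bytes object as plain text.
assert bytes_to_string("b'Heerlijk gegeten!'") == 'Heerlijk gegeten!'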

Format avgPrice#

def clean_price(string):
    '''
    Return the numeric part of the price string (the piece after the last space,
    i.e. the amount following the euro sign) as a float.
    '''
    return float(string.split(" ")[-1])


# Remove the euro sign and leading whitespace from the price.
reviews["avgPrice"] = reviews["avgPrice"].apply(clean_price)

Format ordinal columns#

map_scores = {
    "waitingTimeScore": {
        None: 0,
        "Hoog tempo": 1,
        "Kort": 2,
        "Redelijk": 3,
        "Kan beter": 4,
        "Lang": 5,
    },
    "valueForPriceScore": {
        None: 0,
        "Erg gunstig": 1,
        "Gunstig": 2,
        "Redelijk": 3,
        "Precies goed": 4,
        "Kan beter": 5,
    },
    "noiseLevelScore": {
        None: 0,
        "Erg rustig": 1,
        "Rustig": 2,
        "Precies goed": 3,
        "Rumoerig": 4,
    },
    "reviewerFame": {
        None: 0,
        "Proever": 1,
        "Fijnproever": 2,
        "Expertproever": 3,
        "Meesterproever": 4
    }
}

for col, mapping in map_scores.items():
    # Values not present in the mapping become <NA>; cast to a nullable integer type.
    reviews[col] = reviews[col].map(mapping).astype("Int64")
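An optional sanity check that one of the mapped columns now holds integer codes instead of the original labels:

# The column should now contain small integer codes rather than Dutch labels.
reviews['reviewerFame'].value_counts(dropna=False)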

Add feature: length of reviewText#

reviews['reviewTextLength'] = reviews['reviewText'].str.len().astype("Int64")

Inspect preprocessed data#

reviews.info()
reviews.sample(n=3, random_state=42)

Text pre-processing#

Specify a subset of the data to be used for training#

When using the full dataset, the cells below may take very long to complete.
With 25% of the data, they take about 15 minutes in total on a laptop.

reviews = reviews.sample(frac=0.25, random_state=42)

Tokenize and create Document-Term Matrix#

# %%time
# THIS CELL TAKES ABOUT 12 MINUTES ON A LAPTOP WHEN USING 25% OF THE DATA.

def tokenize_simple(text):
    '''
    Tokenizer returning lowercase tokens with no stop words, no punctuation and no words with encoding errors.
    '''
    doc = nlp(text)
    return [token.lower_ for token in doc if not (token.is_stop or token.is_punct or ("\\" in token.lower_))]


# Some abbreviations aren't in spaCy's default Dutch stopwords list, so we add them.
nlp.Defaults.stop_words.update(['n', '’n', 't'])

# Define the CountVectorizer, use n-grams of length 1.
count_vectorizer = CountVectorizer(tokenizer=tokenize_simple,
                                   stop_words=nlp.Defaults.stop_words,
                                   ngram_range=(1, 1))

# Fit the CountVectorizer. The returned Document-Term Matrix is put in a DataFrame.
dtm = pd.DataFrame.sparse.from_spmatrix(data=count_vectorizer.fit_transform(reviews['reviewText']),
                                        index=reviews['reviewText'].index,
                                        columns=count_vectorizer.get_feature_names_out())
# Compare the tokenized result against the original reviewText for a few observations.
display(pd.DataFrame(reviews.head(n=3)['reviewText']))

subset = dtm.head(n=3).copy()
display(subset.loc[:, subset.ne(0).any()])
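To see what the tokenizer does in isolation, it can be run on a made-up Dutch sentence (the exact output depends on the spaCy model version and stop word list):

# Hypothetical example; stop words and punctuation are stripped.
tokenize_simple("Het eten was erg lekker, maar de bediening was traag!")
# Expected output along the lines of: ['eten', 'lekker', 'bediening', 'traag']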

Select only words that occur more than twice#

print(f"Full DTM: {dtm.shape}")
dtm = dtm.loc[:, (dtm.sum() > 2)]
print(f"Filtered DTM: {dtm.shape}")

Binary classification#

Define Y#

reviews["is_detractor"] = reviews['reviewScoreOverall'].apply(lambda x: True if x <= 6 else False)

Train-test split#

# %%time
# THIS CELL TAKES ABOUT 1 MINUTE ON A LAPTOP WHEN USING 25% OF THE DATA.

X = reviews.drop(columns=['reviewText','reviewScoreOverall','is_detractor'])
y = reviews['is_detractor']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_train_dtm, X_test_dtm = dtm.iloc[train_index, :], dtm.iloc[test_index, :]
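The splitter yields positional indices, which apply equally to the structured features and the row-aligned DTM; a quick shape check confirms that both splits stay in sync:

# Row counts must match pairwise between the structured features and the DTM.
print(X_train.shape, X_train_dtm.shape)
print(X_test.shape, X_test_dtm.shape)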

BalancedBaggingClassifier - using structured data#

# Note: newer imbalanced-learn versions (>= 0.10) name this parameter `estimator`
# instead of `base_estimator`.
clf1 = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                 sampling_strategy='auto',
                                 replacement=False,
                                 random_state=0)
clf1.fit(X_train, y_train)
balanced_accuracy_score(y_test, clf1.predict(X_test)).round(3)

BalancedBaggingClassifier - using DTM#

clf2 = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                 sampling_strategy='auto',
                                 replacement=False,
                                 random_state=0)
clf2.fit(X_train_dtm, y_train)
balanced_accuracy_score(y_test, clf2.predict(X_test_dtm)).round(3)