Objective#
Predict whether a reviewer is a ‘detractor’, so that a restaurant owner can look up relevant (negative) feedback and act on it.
Initialize notebook#
Import dependencies#
import ast
import pandas as pd
import spacy
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
Set configuration#
# Make sure all columns are shown and long texts are not truncated when printing a DataFrame.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 250)
Initialize spaCy model#
nlp = spacy.load("nl_core_news_lg")
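If the Dutch model is not installed yet, it can be downloaded once with spaCy's standard CLI (a one-off step):
# Run once in a terminal, or prefix with ! in a notebook cell:
# python -m spacy download nl_core_news_lg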
Initialize data#
Read data from source#
data_source = "https://bhciaaablob.blob.core.windows.net/cmotionsnlpblogs/RestoReviewRawdata.csv"
raw_df = pd.read_csv(filepath_or_buffer=data_source,
decimal=",",
dtype={"restoId": pd.Int64Dtype(),
"restoName": str,
"tags": str,
"address": str,
"scoreTotal": float,
"avgPrice": str,
"numReviews": pd.Int64Dtype(),
"scoreFood": float,
"scoreService": float,
"scoreDecor": float,
"review_id": pd.Int64Dtype(),
"numreviews2": pd.Int64Dtype(),
"valueForPriceScore": str,
"noiseLevelScore": str,
"waitingTimeScore": str,
"reviewerId": str,
"reviewerFame": str,
"reviewerNumReviews": pd.Int64Dtype(),
"reviewDate": str,
"reviewScoreOverall": float,
"reviewScoreFood": float,
"reviewScoreService": float,
"reviewScoreAmbiance": float,
"reviewText": str})
Inspect raw data#
raw_df.info()
raw_df.sample(n=3, random_state=42)
Prepare data#
Create copy of the raw data#
reviews = raw_df.copy()
Drop irrelevant columns#
print(f"Shape before: {reviews.shape}")
reviews = reviews.drop(columns=['restoName', 'tags', 'address', 'review_id', 'reviewDate'])
print(f"Shape after: {reviews.shape}")
Drop rows with null values#
print(f"Shape before: {reviews.shape}")
reviews = reviews.dropna(axis='index')
print(f"Shape after: {reviews.shape}")
Drop invalid reviews#
def is_invalid_review(review):
    '''
    A review is invalid when it is still being processed
    ("- Recensie is momenteel in behandeling -") or is no longer than 3 characters.
    '''
    return ("- Recensie is momenteel in behandeling -" in review) or (len(review) <= 3)
print(f"Shape before: {reviews.shape}")
reviews = reviews.drop(index=reviews[reviews['reviewText'].apply(is_invalid_review)].index)
print(f"Shape after: {reviews.shape}")
Convert reviewText from “bytes” to “string”#
def bytes_to_string(bytes_literal):
    '''
    The reviewText column holds string representations of bytes objects (e.g. "b'...'").
    Parse the literal safely with ast.literal_eval and decode the bytes as UTF-8.
    '''
    return ast.literal_eval(bytes_literal).decode('utf-8')
reviews['reviewText'] = reviews['reviewText'].apply(bytes_to_string)
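A minimal sanity check of the conversion, using a hypothetical encoded value (the byte sequence decodes to 'café'):
# Hypothetical example: the raw column stores the string "b'caf\xc3\xa9'".
assert bytes_to_string("b'caf\\xc3\\xa9'") == "café"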
Format avgPrice#
def clean_price(string):
    '''
    Return the last whitespace-separated piece of the passed string as a float.
    '''
    return float(string.split(" ")[-1])
# Strip the euro sign and whitespace from avgPrice, keeping only the numeric part.
reviews["avgPrice"] = reviews["avgPrice"].apply(clean_price)
Format ordinal columns#
map_scores = {
    "waitingTimeScore": {
        None: 0,
        "Hoog tempo": 1,  # fast pace
        "Kort": 2,        # short
        "Redelijk": 3,    # reasonable
        "Kan beter": 4,   # could be better
        "Lang": 5,        # long
    },
    "valueForPriceScore": {
        None: 0,
        "Erg gunstig": 1,   # very favorable
        "Gunstig": 2,       # favorable
        "Redelijk": 3,      # reasonable
        "Precies goed": 4,  # just right
        "Kan beter": 5,     # could be better
    },
    "noiseLevelScore": {
        None: 0,
        "Erg rustig": 1,    # very quiet
        "Rustig": 2,        # quiet
        "Precies goed": 3,  # just right
        "Rumoerig": 4,      # noisy
    },
    "reviewerFame": {
        None: 0,
        "Proever": 1,         # taster
        "Fijnproever": 2,     # fine taster
        "Expertproever": 3,   # expert taster
        "Meesterproever": 4,  # master taster
    },
}
# Recode each ordinal column to integers; labels missing from the mapping become <NA>.
for col in map_scores:
    reviews[col] = reviews[col].map(map_scores[col]).astype("Int64")
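To verify the recoding, inspect the distribution of one of the mapped columns:
# Distribution of the recoded reviewerFame column, including unmapped (<NA>) values.
reviews['reviewerFame'].value_counts(dropna=False).sort_index()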
Add feature: length of reviewText#
reviews['reviewTextLength'] = reviews['reviewText'].str.len().astype("Int64")
Inspect preprocessed data#
reviews.info()
reviews.sample(n=3, random_state=42)
Text pre-processing#
Specify a subset of the data to be used for training#
When using the full dataset, the cells below may take very long to complete.
With 25% of the data, they take about 15 minutes on a laptop.
reviews = reviews.sample(frac=0.25, random_state=42)
Tokenize and create Document-Term Matrix#
# %%time
# THIS CELL TAKES ABOUT 12 MINUTES ON A LAPTOP WHEN USING 25% OF THE DATA.
def tokenize_simple(text):
    '''
    Tokenizer that returns lowercase tokens, excluding stop words, punctuation,
    and tokens with encoding errors (those containing a backslash).
    '''
    doc = nlp(text)
    return [token.lower_ for token in doc if not (token.is_stop or token.is_punct or ("\\" in token.lower_))]
# Some abbreviations aren't in spaCy's default Dutch stopwords list, so we add them.
nlp.Defaults.stop_words.update(['n', '’n', 't'])
# Define the CountVectorizer, using unigrams only.
count_vectorizer = CountVectorizer(tokenizer=tokenize_simple,
                                   stop_words=nlp.Defaults.stop_words,
                                   token_pattern=None,  # a custom tokenizer is supplied, so the default pattern is unused
                                   ngram_range=(1, 1))
# Fit the CountVectorizer. The returned Document-Term Matrix is put in a DataFrame.
dtm = pd.DataFrame.sparse.from_spmatrix(data=count_vectorizer.fit_transform(reviews['reviewText']),
index=reviews['reviewText'].index,
columns=count_vectorizer.get_feature_names_out())
# Compare the tokenization with the original reviewText for a few observations.
display(pd.DataFrame(reviews.head(n=3)['reviewText']))
subset = dtm.head(n=3).copy()
display(subset.loc[:, subset.ne(0).any()])
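TfidfVectorizer was imported above but is not used; if TF-IDF weighting is preferred over raw counts, the matrix can be built analogously (an optional variant that is equally slow, and not used by the models below):
# Optional: TF-IDF weighted Document-Term Matrix with the same tokenizer.
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_simple,
                                   stop_words=nlp.Defaults.stop_words,
                                   token_pattern=None,
                                   ngram_range=(1, 1))
tfidf_dtm = pd.DataFrame.sparse.from_spmatrix(data=tfidf_vectorizer.fit_transform(reviews['reviewText']),
                                              index=reviews['reviewText'].index,
                                              columns=tfidf_vectorizer.get_feature_names_out())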
Select only those words that occur more than twice#
print(f"Full DTM: {dtm.shape}")
dtm = dtm.loc[:, (dtm.sum() > 2)]
print(f"Filtered DTM: {dtm.shape}")
Binary classification#
Define Y#
reviews["is_detractor"] = reviews['reviewScoreOverall'].apply(lambda x: True if x <= 6 else False)
Train-test split#
# %%time
# THIS CELL TAKES ABOUT 1 MINUTE ON A LAPTOP WHEN USING 25% OF THE DATA.
X = reviews.drop(columns=['reviewText','reviewScoreOverall','is_detractor'])
y = reviews['is_detractor']
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
X_train_dtm, X_test_dtm = dtm.iloc[train_index, :], dtm.iloc[test_index, :]
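A quick check that the stratified split preserves the detractor ratio in both sets:
# The detractor fraction should be (nearly) identical in train and test.
print(f"Train: {y_train.mean():.3f}, test: {y_test.mean():.3f}")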
BalancedBaggingClassifier - using structured data#
# Note: newer versions of imbalanced-learn rename the `base_estimator` argument to `estimator`.
clf1 = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
sampling_strategy='auto',
replacement=False,
random_state=0)
clf1.fit(X_train, y_train)
balanced_accuracy_score(y_test, clf1.predict(X_test)).round(3)
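Beyond a single balanced-accuracy number, a confusion matrix shows where the structured-data model errs (an optional check):
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes (False, True).
confusion_matrix(y_test, clf1.predict(X_test))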
BalancedBaggingClassifier - using DTM#
clf2 = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
sampling_strategy='auto',
replacement=False,
random_state=0)
clf2.fit(X_train_dtm, y_train)
balanced_accuracy_score(y_test, clf2.predict(X_test_dtm)).round(3)
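BalancedRandomForestClassifier - using DTM#
BalancedRandomForestClassifier was imported at the top but never used; as an untuned sketch of a possible third model on the same DTM features:
# Sketch: a balanced random forest on the DTM (hyperparameters not tuned).
clf3 = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
clf3.fit(X_train_dtm, y_train)
balanced_accuracy_score(y_test, clf3.predict(X_test_dtm)).round(3)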