Fetch and prepare reviews

Short example showing how you can parse a CSV file into parquet files partitioned by year.

import ast

import pandas as pd


# Location of the raw restaurant-review CSV export.
REVIEWS = (
    "https://bhciaaablob.blob.core.windows.net/cmotionsnlpblogs/RestoReviewRawdata.csv"
)
# Numbers in the file are parsed with a comma as the decimal separator.
resto = pd.read_csv(REVIEWS, decimal=",")
# Fix utf-8 encoding: each review text is handled as the textual form of a
# Python bytes literal (missing texts are replaced by the empty literal "b''"),
# which is parsed and then decoded to str.
# ast.literal_eval only accepts literals, so content downloaded from the remote
# CSV can never be executed as code (plain eval() would run arbitrary
# expressions found in the file).
resto.reviewText = resto.reviewText.fillna("b''").apply(
    lambda b: ast.literal_eval(b).decode("utf-8")
)
# Persist the full cleaned table as one CSV file and one parquet file.
resto.to_csv("data/dutch-restaurant-reviews.csv")
resto.to_parquet("data/dutch-restaurant-reviews.parquet")

# Derive the review year from the last four characters of reviewDate.
# The detour via float lets rows without a date end up as <NA> in a
# nullable integer column.
resto["reviewYear"] = (
    resto["reviewDate"].str.slice(start=-4).astype("float").astype("Int64")
)
# Write a parquet dataset partitioned on that year column.
resto.to_parquet(
    "data/dutch-restaurant-reviews-per-year",
    partition_cols=["reviewYear"],
)

# Additionally write one standalone parquet file for each selected year.
for year in (2017, 2018):
    resto[resto["reviewYear"] == year].to_parquet(
        f"data/dutch-restaurant-reviews-{year}"
    )
# Select the review texts that are still missing (NaN); an empty result means none are left.
resto.reviewText[resto.reviewText.isna()]
Series([], Name: reviewText, dtype: object)
# Show the table dimensions as (rows, columns).
resto.shape
(379718, 25)
# Inspect all columns of the row at integer position 379422.
resto.iloc[379422,:]
restoId                                                           256599
restoName                                    Pasta di Mamma (Foodhallen)
tags                   ITALIAANS|AMSTERDAM-WEST|KINKERBUURT|PROVINCIE...
address                         Bellamyplein 51 1053 Amsterdam Nederland
scoreTotal                                                           NaN
avgPrice                                                             NaN
numReviews                                                             0
scoreFood                                                            NaN
scoreService                                                         NaN
scoreDecor                                                           NaN
review_id                                                            0.0
numreviews2                                                          NaN
valueForPriceScore                                                   NaN
noiseLevelScore                                                      NaN
waitingTimeScore                                                     NaN
reviewerId                                                           NaN
reviewerFame                                                         NaN
reviewerNumReviews                                                   NaN
reviewDate                                                           NaN
reviewScoreOverall                                                   NaN
reviewScoreFood                                                      NaN
reviewScoreService                                                   NaN
reviewScoreAmbiance                                                  NaN
reviewText                                                              
reviewYear                                                          <NA>
Name: 379422, dtype: object