Fetch and prepare reviews
Short example showing how you can load a CSV file of reviews and write it to Parquet files partitioned by year.
import ast
from pathlib import Path

import pandas as pd
# Location of the raw restaurant-review CSV.
REVIEWS = (
    "https://bhciaaablob.blob.core.windows.net/cmotionsnlpblogs/RestoReviewRawdata.csv"
)
# Numbers in this file are read with a comma as the decimal separator.
resto = pd.read_csv(REVIEWS, decimal=",")
# fix utf-8 encoding
# Each reviewText cell is handled as the text of a bytes literal (missing
# cells become the empty literal b'' and therefore decode to "").
# ast.literal_eval only accepts Python literals, so content downloaded from
# the network cannot execute arbitrary code the way eval() could.
resto.reviewText = resto.reviewText.fillna("b''").apply(
    lambda literal: ast.literal_eval(literal).decode("utf-8")
)
# write to single file
# Create the output directory first; writing into a directory that does not
# exist fails, which would break the example on a fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)
# The same frame is stored twice: once as CSV and once as a single Parquet file.
resto.to_csv("data/dutch-restaurant-reviews.csv")
resto.to_parquet("data/dutch-restaurant-reviews.parquet")
# write to partitioned parquet files
# The last four characters of reviewDate are taken as the year. The cast goes
# through float so that missing dates end up as <NA> in the nullable Int64 column.
year_suffix = resto["reviewDate"].str[-4:]
resto["reviewYear"] = year_suffix.astype("float").astype("Int64")
# One sub-directory per distinct reviewYear value is written under this path.
partition_columns = ["reviewYear"]
resto.to_parquet(
    "data/dutch-restaurant-reviews-per-year", partition_cols=partition_columns
)
# write parquet file single year
# One standalone Parquet file per selected year, holding only that year's rows.
for review_year in (2017, 2018):
    in_year = resto["reviewYear"] == review_year
    resto.loc[in_year].to_parquet(f'data/dutch-restaurant-reviews-{review_year}')
# Sanity check: list any review texts that are still missing (expected: none,
# since missing values were filled before decoding).
resto.reviewText[resto.reviewText.isna()]
Series([], Name: reviewText, dtype: object)
# (number of rows, number of columns) of the prepared frame.
resto.shape
(379718, 25)
# Inspect a single row by position; this one has no review attached, so its
# reviewText is empty and its reviewYear is <NA>.
resto.iloc[379422,:]
restoId 256599
restoName Pasta di Mamma (Foodhallen)
tags ITALIAANS|AMSTERDAM-WEST|KINKERBUURT|PROVINCIE...
address Bellamyplein 51 1053 Amsterdam Nederland
scoreTotal NaN
avgPrice NaN
numReviews 0
scoreFood NaN
scoreService NaN
scoreDecor NaN
review_id 0.0
numreviews2 NaN
valueForPriceScore NaN
noiseLevelScore NaN
waitingTimeScore NaN
reviewerId NaN
reviewerFame NaN
reviewerNumReviews NaN
reviewDate NaN
reviewScoreOverall NaN
reviewScoreFood NaN
reviewScoreService NaN
reviewScoreAmbiance NaN
reviewText
reviewYear <NA>
Name: 379422, dtype: object