-
Download the IMDB Dataset.
-
Execute
node create-db-copy-files.jsto create the copy files. -
Copy the files to the server.
scp *.sql ip-address:/home/imdb -
On the server, execute the following commands:
dokku postgres:create hfr-imdb
dokku postgres:expose hfr-imdb
DB_URL=$(dokku postgres:info hfr-imdb --dsn | sed 's/@.*:/@localhost:/')
NEW_DB=$(echo $DB_URL | sed 's/5432\/.*/5432\/imdb/')
echo $DB_URL > imdb/DB_URL
echo $NEW_DB > imdb/NEW_DB
docker cp imdb/ $(dokku postgres:info hfr-imdb --id):/home/imdb
dokku postgres:enter hfr-imdb
cd /home/imdb
psql $(cat DB_URL) -f ./_1-create-db.sql
psql $(cat NEW_DB) -f ./_2-setup-db.sql
psql $(cat NEW_DB) -f ./_3-restore-db.sql
Troubleshooting​
Problem: COPY does not work when a single line is erroneous.
ERROR: insert or update on table "humans_movies" violates foreign key constraint "humans_movies_movie_id_fkey"
DETAIL: Key (movie_id)=(tt16607280) is not present in table "movies"
To remove the erroneous line, you can use the following command:
sed -i '/tt16607280/d' humans_movies.sql
and reimport this file:
psql $(cat NEW_DB) -c "COPY humans_movies FROM '/home/imdb/humans_movies.sql';"
Unknown Entries 16.08.2023
The following keys are in humans_movies.sql but not found in movies.sql:
tt8885954tt16607280tt14469846tt27415372tt27866357tt10857444tt9676044tt9685030tt12247918tt11812250tt10452116tt14703146tt27411480tt13853058tt22183860tt14273900tt18116552tt3644138tt20356568tt21051790tt3435486tt21411828tt22050582tt27685422tt21237486tt23767348tt27147468tt25429502tt27493989tt27936112tt28108823tt27310698tt27929191tt7303822
eventually it is easier at one point to split the file into multiple files:
NR=$(awk '/tt21237486/{ print NR; exit }' humans_movies.sql)
split -l $NR humans_movies.sql human_movs_
Index​
-- for text search gin + trigram indexes (reduces search time from ~3s to ~50ms)
CREATE INDEX ON movies USING gin (primary_title gin_trgm_ops);
CREATE INDEX ON movies USING gin (original_title gin_trgm_ops);
CREATE INDEX ON humans USING gin (name gin_trgm_ops);
-- for eq-comparison a normal btree index is faster
CREATE INDEX ON movies(primary_title);
CREATE INDEX ON movies(original_title);
CREATE INDEX ON humans(name);
CREATE INDEX ON movies(type);
CREATE INDEX ON movies(primary_title);
CREATE INDEX ON movies(original_title);
CREATE INDEX ON movies(is_adult);
CREATE INDEX ON movies(start_year);
CREATE INDEX ON movies(end_year);
CREATE INDEX ON movies(runtime_minutes);
CREATE INDEX ON humans(name);
CREATE INDEX ON humans(birth_year);
CREATE INDEX ON humans(death_year);
CREATE INDEX ON humans_movies(human_id);
CREATE INDEX ON humans_movies(movie_id);
CREATE INDEX ON ratings(averageRating);
CREATE INDEX ON ratings(numVotes);
CREATE INDEX ON ratings(movie_id);
Setup IMDB Database