WHERE vs Join clauses using Postgres Airlines sample database
Published on Oct 30, 2024 10:09 AM
0
In [2]:
HOST = "192.124.206.2"
In [3]:
import psycopg2
import pandas as pd
# Connect to the database
conn = psycopg2.connect(
host=HOST,
database="airlines",
user="datawars",
password=""
)
# Read the bookings table
bookings = pd.read_sql("SELECT * FROM bookings", conn)
print(bookings.head())
/tmp/ipykernel_15/2123935338.py:13: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.
bookings = pd.read_sql("SELECT * FROM bookings", conn)
book_ref book_date total_amount 0 00000F 2017-07-05 00:12:00+00:00 265700.0 1 000012 2017-07-14 06:02:00+00:00 37900.0 2 00002D 2017-05-20 15:45:00+00:00 114700.0 3 000068 2017-08-15 11:27:00+00:00 18100.0 4 0000C9 2017-06-30 12:52:00+00:00 54600.0
In [20]:
query1 = """
SELECT
t.passenger_name,
t.ticket_no,
bp.seat_no
FROM
Flights f
JOIN
Ticket_flights tf ON f.flight_id = tf.flight_id
JOIN
Tickets t ON tf.ticket_no = t.ticket_no
JOIN
Boarding_passes bp ON t.ticket_no = bp.ticket_no AND tf.flight_id = bp.flight_id
WHERE
f.arrival_airport = 'OVB';
"""
In [21]:
query2 = """
SELECT
t.passenger_name,
t.ticket_no,
bp.seat_no
FROM
Flights f
JOIN
Ticket_flights tf ON (f.flight_id = tf.flight_id AND f.arrival_airport = 'OVB')
JOIN
Tickets t ON tf.ticket_no = t.ticket_no
JOIN
Boarding_passes bp ON t.ticket_no = bp.ticket_no AND tf.flight_id = bp.flight_id
"""
In [38]:
cur = conn.cursor()
cur.execute(f"EXPLAIN (FORMAT JSON) {query1}")
In [39]:
plan = cur.fetchone()
plan
Out[39]:
([{'Plan': {'Node Type': 'Gather',
'Parallel Aware': False,
'Startup Cost': 2617.76,
'Total Cost': 36544.08,
'Plan Rows': 9462,
'Plan Width': 108,
'Workers Planned': 2,
'Single Copy': False,
'Plans': [{'Node Type': 'Nested Loop',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1617.76,
'Total Cost': 34597.88,
'Plan Rows': 3942,
'Plan Width': 108,
'Inner Unique': True,
'Plans': [{'Node Type': 'Nested Loop',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1617.33,
'Total Cost': 32739.41,
'Plan Rows': 3942,
'Plan Width': 132,
'Inner Unique': True,
'Join Filter': '(f.flight_id = tf.flight_id)',
'Plans': [{'Node Type': 'Hash Join',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1616.9,
'Total Cost': 25510.82,
'Plan Rows': 3942,
'Plan Width': 84,
'Inner Unique': True,
'Hash Cond': '(bp.flight_id = f.flight_id)',
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Outer',
'Parallel Aware': True,
'Relation Name': 'boarding_passes',
'Alias': 'bp',
'Startup Cost': 0.0,
'Total Cost': 21821.9,
'Plan Rows': 789290,
'Plan Width': 80},
{'Node Type': 'Hash',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Startup Cost': 1612.8,
'Total Cost': 1612.8,
'Plan Rows': 328,
'Plan Width': 4,
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Relation Name': 'flights',
'Alias': 'f',
'Startup Cost': 0.0,
'Total Cost': 1612.8,
'Plan Rows': 328,
'Plan Width': 4,
'Filter': "(arrival_airport = 'OVB'::bpchar)"}]}]},
{'Node Type': 'Index Only Scan',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Scan Direction': 'Forward',
'Index Name': 'ticket_flights_pkey',
'Relation Name': 'ticket_flights',
'Alias': 'tf',
'Startup Cost': 0.43,
'Total Cost': 1.1,
'Plan Rows': 59,
'Plan Width': 60,
'Index Cond': '((ticket_no = bp.ticket_no) AND (flight_id = bp.flight_id))'}]},
{'Node Type': 'Index Scan',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Scan Direction': 'Forward',
'Index Name': 'tickets_pkey',
'Relation Name': 'tickets',
'Alias': 't',
'Startup Cost': 0.42,
'Total Cost': 0.47,
'Plan Rows': 1,
'Plan Width': 88,
'Index Cond': '(ticket_no = tf.ticket_no)'}]}]}}],)
In [40]:
cur = conn.cursor()
cur.execute(f"EXPLAIN (FORMAT JSON) {query2}")
In [41]:
plan = cur.fetchone()
plan
Out[41]:
([{'Plan': {'Node Type': 'Gather',
'Parallel Aware': False,
'Startup Cost': 2617.76,
'Total Cost': 36544.08,
'Plan Rows': 9462,
'Plan Width': 108,
'Workers Planned': 2,
'Single Copy': False,
'Plans': [{'Node Type': 'Nested Loop',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1617.76,
'Total Cost': 34597.88,
'Plan Rows': 3942,
'Plan Width': 108,
'Inner Unique': True,
'Plans': [{'Node Type': 'Nested Loop',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1617.33,
'Total Cost': 32739.41,
'Plan Rows': 3942,
'Plan Width': 132,
'Inner Unique': True,
'Join Filter': '(f.flight_id = tf.flight_id)',
'Plans': [{'Node Type': 'Hash Join',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Join Type': 'Inner',
'Startup Cost': 1616.9,
'Total Cost': 25510.82,
'Plan Rows': 3942,
'Plan Width': 84,
'Inner Unique': True,
'Hash Cond': '(bp.flight_id = f.flight_id)',
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Outer',
'Parallel Aware': True,
'Relation Name': 'boarding_passes',
'Alias': 'bp',
'Startup Cost': 0.0,
'Total Cost': 21821.9,
'Plan Rows': 789290,
'Plan Width': 80},
{'Node Type': 'Hash',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Startup Cost': 1612.8,
'Total Cost': 1612.8,
'Plan Rows': 328,
'Plan Width': 4,
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Relation Name': 'flights',
'Alias': 'f',
'Startup Cost': 0.0,
'Total Cost': 1612.8,
'Plan Rows': 328,
'Plan Width': 4,
'Filter': "(arrival_airport = 'OVB'::bpchar)"}]}]},
{'Node Type': 'Index Only Scan',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Scan Direction': 'Forward',
'Index Name': 'ticket_flights_pkey',
'Relation Name': 'ticket_flights',
'Alias': 'tf',
'Startup Cost': 0.43,
'Total Cost': 1.1,
'Plan Rows': 59,
'Plan Width': 60,
'Index Cond': '((ticket_no = bp.ticket_no) AND (flight_id = bp.flight_id))'}]},
{'Node Type': 'Index Scan',
'Parent Relationship': 'Inner',
'Parallel Aware': False,
'Scan Direction': 'Forward',
'Index Name': 'tickets_pkey',
'Relation Name': 'tickets',
'Alias': 't',
'Startup Cost': 0.42,
'Total Cost': 0.47,
'Plan Rows': 1,
'Plan Width': 88,
'Index Cond': '(ticket_no = tf.ticket_no)'}]}]}}],)
Other tests¶
In [3]:
q1 = """SELECT airport_code, airport_na
me
FROM airports_data
WHERE timezone = 'Europe/Moscow'
UNION
SELECT airport_code, airport_name
FROM airports_data
WHERE timezone = 'Asia/Yekaterinburg'
ORDER BY airport_code;
"""
In [4]:
q2 = """SELECT airport_code, airport_name
FROM airports_data
WHERE timezone IN ('Europe/Moscow', 'Asia/Yekaterinburg')
ORDER BY airport_code"""
In [6]:
# Both queries return the same thing
pd.testing.assert_frame_equal(pd.read_sql(q1, conn), pd.read_sql(q2, conn))
/tmp/ipykernel_15/539518604.py:2: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy. pd.testing.assert_frame_equal(pd.read_sql(q1, conn), pd.read_sql(q2, conn))
Analyze User query¶
In [ ]:
def extract_node_types(plan):
node_types = [plan.get('Node Type')]
if 'Plans' in plan:
for subplan in plan['Plans']:
node_types.extend(extract_node_types(subplan))
return node_types
Query 1 with UNION¶
In [29]:
cur = conn.cursor()
cur.execute(f"EXPLAIN (FORMAT JSON) {q1}")
In [30]:
plans = cur.fetchone()[0]
plans
Out[30]:
[{'Plan': {'Node Type': 'Unique',
'Parallel Aware': False,
'Startup Cost': 8.64,
'Total Cost': 8.65,
'Plan Rows': 2,
'Plan Width': 48,
'Plans': [{'Node Type': 'Sort',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Startup Cost': 8.64,
'Total Cost': 8.64,
'Plan Rows': 2,
'Plan Width': 48,
'Sort Key': ['airports_data.airport_code', 'airports_data.airport_name'],
'Plans': [{'Node Type': 'Append',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Startup Cost': 0.0,
'Total Cost': 8.63,
'Plan Rows': 2,
'Plan Width': 48,
'Subplans Removed': 0,
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Member',
'Parallel Aware': False,
'Relation Name': 'airports_data',
'Alias': 'airports_data',
'Startup Cost': 0.0,
'Total Cost': 4.3,
'Plan Rows': 1,
'Plan Width': 48,
'Filter': "(timezone = 'Europe/Moscow'::text)"},
{'Node Type': 'Seq Scan',
'Parent Relationship': 'Member',
'Parallel Aware': False,
'Relation Name': 'airports_data',
'Alias': 'airports_data_1',
'Startup Cost': 0.0,
'Total Cost': 4.3,
'Plan Rows': 1,
'Plan Width': 48,
'Filter': "(timezone = 'Asia/Yekaterinburg'::text)"}]}]}]}}]
In [34]:
for plan in plans:
print(extract_node_types(plan['Plan']))
['Unique', 'Sort', 'Append', 'Seq Scan', 'Seq Scan']
Query 2 with WHERE¶
In [35]:
cur = conn.cursor()
cur.execute(f"EXPLAIN (FORMAT JSON) {q2}")
In [36]:
plans = cur.fetchone()[0]
plans
Out[36]:
[{'Plan': {'Node Type': 'Sort',
'Parallel Aware': False,
'Startup Cost': 4.31,
'Total Cost': 4.31,
'Plan Rows': 2,
'Plan Width': 48,
'Sort Key': ['airport_code'],
'Plans': [{'Node Type': 'Seq Scan',
'Parent Relationship': 'Outer',
'Parallel Aware': False,
'Relation Name': 'airports_data',
'Alias': 'airports_data',
'Startup Cost': 0.0,
'Total Cost': 4.3,
'Plan Rows': 2,
'Plan Width': 48,
'Filter': "(timezone = ANY ('{Europe/Moscow,Asia/Yekaterinburg}'::text[]))"}]}}]
In [37]:
for plan in plans:
print(extract_node_types(plan['Plan']))
['Sort', 'Seq Scan']