Data Modeling¶
Normalization¶
When to Denormalize¶
Entity Relationship Modeling¶
Relationships¶
E-commerce Schema Example¶
-- Core entities
CREATE TABLE customers (
id BIGSERIAL PRIMARY KEY,
email VARCHAR(255) UNIQUE NOT NULL,
name VARCHAR(255) NOT NULL,
created_at TIMESTAMP DEFAULT NOW()
);
CREATE TABLE products (
id BIGSERIAL PRIMARY KEY,
sku VARCHAR(50) UNIQUE NOT NULL,
name VARCHAR(255) NOT NULL,
price DECIMAL(10,2) NOT NULL,
category_id BIGINT REFERENCES categories(id),
created_at TIMESTAMP DEFAULT NOW()
);
CREATE TABLE orders (
id BIGSERIAL PRIMARY KEY,
order_number VARCHAR(50) UNIQUE NOT NULL,
customer_id BIGINT NOT NULL REFERENCES customers(id),
status VARCHAR(20) NOT NULL DEFAULT 'pending',
total DECIMAL(12,2) NOT NULL,
shipping_address_id BIGINT REFERENCES addresses(id),
created_at TIMESTAMP DEFAULT NOW(),
-- Indexes for common queries
CONSTRAINT valid_status CHECK (status IN ('pending', 'confirmed', 'shipped', 'delivered', 'cancelled'))
);
CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_orders_status ON orders(status);
CREATE INDEX idx_orders_created ON orders(created_at DESC);
-- Junction table with attributes
CREATE TABLE order_items (
id BIGSERIAL PRIMARY KEY,
order_id BIGINT NOT NULL REFERENCES orders(id),
product_id BIGINT NOT NULL REFERENCES products(id),
quantity INT NOT NULL CHECK (quantity > 0),
unit_price DECIMAL(10,2) NOT NULL, -- Denormalized (price at time of order)
UNIQUE(order_id, product_id)
);
-- Polymorphic association (addresses for multiple entities)
CREATE TABLE addresses (
id BIGSERIAL PRIMARY KEY,
addressable_type VARCHAR(50) NOT NULL, -- 'customer', 'order'
addressable_id BIGINT NOT NULL,
address_type VARCHAR(20), -- 'billing', 'shipping'
line1 VARCHAR(255) NOT NULL,
city VARCHAR(100) NOT NULL,
postal_code VARCHAR(20) NOT NULL,
country VARCHAR(2) NOT NULL,
INDEX idx_addressable (addressable_type, addressable_id)
);
NoSQL Data Modeling¶
Document Model (MongoDB)¶
// Embedded (denormalized) - for 1:few, frequently accessed together
{
"_id": "order_123",
"customer": {
"id": "cust_456",
"name": "John Doe",
"email": "[email protected]" // Denormalized
},
"items": [
{
"product_id": "prod_789",
"name": "Laptop", // Denormalized
"quantity": 1,
"price": 999.99
}
],
"shipping_address": {
"line1": "123 Main St",
"city": "NYC",
"country": "US"
},
"status": "pending",
"total": 999.99,
"created_at": ISODate("2024-01-15T10:00:00Z")
}
// Referenced - for 1:many, large documents, frequently updated
// Order document
{
"_id": "order_123",
"customer_id": "cust_456", // Reference
"item_ids": ["item_1", "item_2"], // References
"status": "pending"
}
// Guidelines:
// Embed when:
// - Data is read together
// - "Contains" relationship
// - Few child documents
// - Children don't change often
// Reference when:
// - Data accessed independently
// - Many child documents (unbounded)
// - Children frequently updated
// - Many-to-many relationships
Wide-Column Model (Cassandra)¶
Key-Value Model (DynamoDB)¶
Time-Series Data Modeling¶
-- Partitioned by time for efficient queries and retention
CREATE TABLE metrics (
metric_name VARCHAR(100) NOT NULL,
tags JSONB,
timestamp TIMESTAMPTZ NOT NULL,
value DOUBLE PRECISION NOT NULL,
PRIMARY KEY (metric_name, timestamp)
) PARTITION BY RANGE (timestamp);
-- Create partitions
CREATE TABLE metrics_2024_01 PARTITION OF metrics
FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
-- TimescaleDB hypertable (automatic partitioning)
SELECT create_hypertable('metrics', 'timestamp',
chunk_time_interval => INTERVAL '1 day');
-- Continuous aggregates for rollups
CREATE MATERIALIZED VIEW metrics_hourly
WITH (timescaledb.continuous) AS
SELECT
metric_name,
time_bucket('1 hour', timestamp) AS hour,
AVG(value) AS avg_value,
MAX(value) AS max_value,
MIN(value) AS min_value,
COUNT(*) AS sample_count
FROM metrics
GROUP BY metric_name, time_bucket('1 hour', timestamp);
-- Retention policy
SELECT add_retention_policy('metrics', INTERVAL '90 days');
Graph Data Modeling¶
Schema Evolution¶
Relational Schema Migration¶
-- Safe migration patterns
-- Add column (backward compatible)
ALTER TABLE orders ADD COLUMN notes TEXT;
-- Add column with default
ALTER TABLE orders ADD COLUMN priority INT DEFAULT 0;
-- Rename column (two-step deployment)
-- Step 1: Add new column, backfill, write to both
ALTER TABLE orders ADD COLUMN order_total DECIMAL(12,2);
UPDATE orders SET order_total = total;
-- Application writes to both columns
-- Step 2: After all code deployed
ALTER TABLE orders DROP COLUMN total;
-- Add index concurrently (no lock)
CREATE INDEX CONCURRENTLY idx_orders_status ON orders(status);
-- Change column type (careful!)
-- Option 1: Add new column, migrate, drop old
-- Option 2: If compatible, direct alter
ALTER TABLE orders ALTER COLUMN status TYPE VARCHAR(50);
Document Schema Evolution¶
// MongoDB schema versioning
{
"_id": "doc_123",
"schema_version": 2,
"name": "Product",
// v2 fields
"price": {
"amount": 1999,
"currency": "USD"
}
}
// Migration on read (lazy)
function migrateDocument(doc) {
if (doc.schema_version === 1) {
// Migrate v1 to v2
doc.price = {
amount: doc.price_cents,
currency: "USD"
};
delete doc.price_cents;
doc.schema_version = 2;
}
return doc;
}
// Or use MongoDB's $addFields in aggregation
db.products.aggregate([
{
$addFields: {
price: {
$cond: {
if: { $eq: ["$schema_version", 1] },
then: { amount: "$price_cents", currency: "USD" },
else: "$price"
}
}
}
}
]);
Common Interview Questions¶
- When to normalize vs denormalize?
- Normalize: Write-heavy, data integrity critical
-
Denormalize: Read-heavy, performance critical
-
Document vs Relational model?
- Document: Flexible schema, nested data, read together
-
Relational: Strong consistency, complex queries, relationships
-
How to model many-to-many?
- Relational: Junction table
-
Document: Array of references or embedded if bounded
-
Cassandra data modeling principles?
- Design for queries, not relationships
- Denormalize heavily
-
One table per query pattern
-
Schema evolution strategies?
- Add columns with defaults
- Two-phase migrations
- Version documents for NoSQL