Data Modeling¶

Normalization¶

Normalization Forms

When to Denormalize¶

Normalization vs Denormalization

Entity Relationship Modeling¶

Relationships¶

Entity Relationships

E-commerce Schema Example¶

-- Core entities
CREATE TABLE customers (
    id BIGSERIAL PRIMARY KEY,
    email VARCHAR(255) UNIQUE NOT NULL,
    name VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT NOW()
);

CREATE TABLE products (
    id BIGSERIAL PRIMARY KEY,
    sku VARCHAR(50) UNIQUE NOT NULL,
    name VARCHAR(255) NOT NULL,
    price DECIMAL(10,2) NOT NULL,
    category_id BIGINT REFERENCES categories(id),
    created_at TIMESTAMP DEFAULT NOW()
);

CREATE TABLE orders (
    id BIGSERIAL PRIMARY KEY,
    order_number VARCHAR(50) UNIQUE NOT NULL,
    customer_id BIGINT NOT NULL REFERENCES customers(id),
    status VARCHAR(20) NOT NULL DEFAULT 'pending',
    total DECIMAL(12,2) NOT NULL,
    shipping_address_id BIGINT REFERENCES addresses(id),
    created_at TIMESTAMP DEFAULT NOW(),

    -- Indexes for common queries
    CONSTRAINT valid_status CHECK (status IN ('pending', 'confirmed', 'shipped', 'delivered', 'cancelled'))
);

CREATE INDEX idx_orders_customer ON orders(customer_id);
CREATE INDEX idx_orders_status ON orders(status);
CREATE INDEX idx_orders_created ON orders(created_at DESC);

-- Junction table with attributes
CREATE TABLE order_items (
    id BIGSERIAL PRIMARY KEY,
    order_id BIGINT NOT NULL REFERENCES orders(id),
    product_id BIGINT NOT NULL REFERENCES products(id),
    quantity INT NOT NULL CHECK (quantity > 0),
    unit_price DECIMAL(10,2) NOT NULL,  -- Denormalized (price at time of order)

    UNIQUE(order_id, product_id)
);

-- Polymorphic association (addresses for multiple entities)
CREATE TABLE addresses (
    id BIGSERIAL PRIMARY KEY,
    addressable_type VARCHAR(50) NOT NULL,  -- 'customer', 'order'
    addressable_id BIGINT NOT NULL,
    address_type VARCHAR(20),  -- 'billing', 'shipping'
    line1 VARCHAR(255) NOT NULL,
    city VARCHAR(100) NOT NULL,
    postal_code VARCHAR(20) NOT NULL,
    country VARCHAR(2) NOT NULL,

    INDEX idx_addressable (addressable_type, addressable_id)
);

NoSQL Data Modeling¶

Document Model (MongoDB)¶

// Embedded (denormalized) - for 1:few, frequently accessed together
{
  "_id": "order_123",
  "customer": {
    "id": "cust_456",
    "name": "John Doe",
    "email": "[email protected]"  // Denormalized
  },
  "items": [
    {
      "product_id": "prod_789",
      "name": "Laptop",           // Denormalized
      "quantity": 1,
      "price": 999.99
    }
  ],
  "shipping_address": {
    "line1": "123 Main St",
    "city": "NYC",
    "country": "US"
  },
  "status": "pending",
  "total": 999.99,
  "created_at": ISODate("2024-01-15T10:00:00Z")
}

// Referenced - for 1:many, large documents, frequently updated
// Order document
{
  "_id": "order_123",
  "customer_id": "cust_456",  // Reference
  "item_ids": ["item_1", "item_2"],  // References
  "status": "pending"
}

// Guidelines:
// Embed when:
// - Data is read together
// - "Contains" relationship
// - Few child documents
// - Children don't change often

// Reference when:
// - Data accessed independently
// - Many child documents (unbounded)
// - Children frequently updated
// - Many-to-many relationships

Wide-Column Model (Cassandra)¶

Cassandra Data Modeling

Key-Value Model (DynamoDB)¶

DynamoDB Data Modeling

Time-Series Data Modeling¶

-- Partitioned by time for efficient queries and retention
CREATE TABLE metrics (
    metric_name VARCHAR(100) NOT NULL,
    tags JSONB,
    timestamp TIMESTAMPTZ NOT NULL,
    value DOUBLE PRECISION NOT NULL,

    PRIMARY KEY (metric_name, timestamp)
) PARTITION BY RANGE (timestamp);

-- Create partitions
CREATE TABLE metrics_2024_01 PARTITION OF metrics
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

-- TimescaleDB hypertable (automatic partitioning)
SELECT create_hypertable('metrics', 'timestamp',
    chunk_time_interval => INTERVAL '1 day');

-- Continuous aggregates for rollups
CREATE MATERIALIZED VIEW metrics_hourly
WITH (timescaledb.continuous) AS
SELECT
    metric_name,
    time_bucket('1 hour', timestamp) AS hour,
    AVG(value) AS avg_value,
    MAX(value) AS max_value,
    MIN(value) AS min_value,
    COUNT(*) AS sample_count
FROM metrics
GROUP BY metric_name, time_bucket('1 hour', timestamp);

-- Retention policy
SELECT add_retention_policy('metrics', INTERVAL '90 days');

Graph Data Modeling¶

Graph Data Model

Schema Evolution¶

Relational Schema Migration¶

-- Safe migration patterns

-- Add column (backward compatible)
ALTER TABLE orders ADD COLUMN notes TEXT;

-- Add column with default
ALTER TABLE orders ADD COLUMN priority INT DEFAULT 0;

-- Rename column (two-step deployment)
-- Step 1: Add new column, backfill, write to both
ALTER TABLE orders ADD COLUMN order_total DECIMAL(12,2);
UPDATE orders SET order_total = total;
-- Application writes to both columns

-- Step 2: After all code deployed
ALTER TABLE orders DROP COLUMN total;

-- Add index concurrently (no lock)
CREATE INDEX CONCURRENTLY idx_orders_status ON orders(status);

-- Change column type (careful!)
-- Option 1: Add new column, migrate, drop old
-- Option 2: If compatible, direct alter
ALTER TABLE orders ALTER COLUMN status TYPE VARCHAR(50);

Document Schema Evolution¶

// MongoDB schema versioning
{
  "_id": "doc_123",
  "schema_version": 2,
  "name": "Product",
  // v2 fields
  "price": {
    "amount": 1999,
    "currency": "USD"
  }
}

// Migration on read (lazy)
function migrateDocument(doc) {
  if (doc.schema_version === 1) {
    // Migrate v1 to v2
    doc.price = {
      amount: doc.price_cents,
      currency: "USD"
    };
    delete doc.price_cents;
    doc.schema_version = 2;
  }
  return doc;
}

// Or use MongoDB's $addFields in aggregation
db.products.aggregate([
  {
    $addFields: {
      price: {
        $cond: {
          if: { $eq: ["$schema_version", 1] },
          then: { amount: "$price_cents", currency: "USD" },
          else: "$price"
        }
      }
    }
  }
]);

Common Interview Questions¶

When to normalize vs denormalize?
Normalize: Write-heavy, data integrity critical
Denormalize: Read-heavy, performance critical
Document vs Relational model?
Document: Flexible schema, nested data, read together
Relational: Strong consistency, complex queries, relationships
How to model many-to-many?
Relational: Junction table
Document: Array of references or embedded if bounded
Cassandra data modeling principles?
Design for queries, not relationships
Denormalize heavily
One table per query pattern
Schema evolution strategies?
Add columns with defaults
Two-phase migrations
Version documents for NoSQL