Database Sharding
Definition

Sharding Strategies

Shard Key Selection
// SHARD KEY: The column used to determine shard placement
// GOOD SHARD KEY CHARACTERISTICS:
// 1. High cardinality (many unique values)
// 2. Even distribution
// 3. Immutable (changing key = moving data)
// 4. Frequently used in queries
// EXAMPLE: E-commerce application
// Option 1: Shard by user_id
// ✓ User data isolated to one shard
// ✓ User queries hit single shard
// ✗ Orders for one product spread across all shards
// Option 2: Shard by order_id
// ✓ Orders evenly distributed
// ✗ User's orders spread across all shards
// ✗ User dashboard requires cross-shard query
// Option 3: Compound shard key (user_id, order_date)
// Can efficiently query user's recent orders
// MULTI-TENANT SHARDING
// Shard by tenant_id
// ✓ Tenant data isolation
// ✓ Easy to move entire tenant
// ✓ Per-tenant queries efficient
class ShardRouter {
int getShardForUser(long userId) {
return Math.abs(Long.hashCode(userId)) % numShards;
}
DataSource getDataSource(long userId) {
int shard = getShardForUser(userId);
return shardDataSources.get(shard);
}
// Cross-shard query (expensive!)
List<Order> getAllOrders() {
return shardDataSources.parallelStream()
.flatMap(ds -> queryOrders(ds).stream())
.collect(Collectors.toList());
}
}
Sharding Challenges

Implementation Patterns
// ROUTING LAYER
// Application-level routing
@Service
class UserService {
@Autowired
private Map<Integer, DataSource> shardDataSources;
User getUser(long userId) {
int shard = getShard(userId);
DataSource ds = shardDataSources.get(shard);
return queryUser(ds, userId);
}
private int getShard(long userId) {
return Math.abs(Long.hashCode(userId)) % shardDataSources.size();
}
}
// Proxy-based routing (e.g., Vitess, ProxySQL)
// Application connects to proxy
// Proxy routes to correct shard
// Transparent to application
// GLOBAL TABLES (replicated everywhere)
// For small, rarely-changing lookup data
// - Countries, currencies, config
// Replicated to all shards for local joins
// SCATTER-GATHER QUERY
List<Product> searchProducts(String term) {
return shardDataSources.parallelStream()
.flatMap(ds -> {
try {
return search(ds, term).stream();
} catch (Exception e) {
return Stream.empty(); // Handle shard failure
}
})
.sorted(Comparator.comparing(Product::getRelevance).reversed())
.limit(100)
.collect(Collectors.toList());
}
// SEQUENCE GENERATION (unique IDs across shards)
// Option 1: UUID (no coordination)
// Option 2: Snowflake IDs (timestamp + machine + sequence)
// Option 3: Dedicated sequence service
Alternatives to Sharding

Tips & Tricks
