First let's talk about contiguity. I think that you should value compactness over contiguity; after all, the Americans have proven that anything can be contiguous if you believe it hard enough. Contiguity is a "hard problem" to represent and compactness is an "easy problem" to represent. Scalability becomes a challenge because you aren't performing a simple similarity clustering (e.g. k-means); you're performing clustering with much more complex criteria. This is not my specialty, and so in my demonstration I show a simplification that is stochastically correct: assuming player skill and position are all uniformly distributed, a random selection of team centroid based on population density that is also of uniform distribution will represent reality (especially if you run multiple outer iterations with random perturbation). Specifically, if you have time to burn and want to refine the solution, you can wrap this approach in a differential evolution loop with the team centroid as the evolved parameter.
Speaking of complex criteria - this system is only useful if you construct correct teams, and correct teams require position criteria. I present a matrix that you can adjust to set lower and upper bounds on the required number of positions per team.
It is possible - but potentially very slow - to tell an optimizer to narrow the range of allowed skill levels. You may be best off to perform this as a pre-solve step, eliminating potential players that are some selected number of standard deviations away from your target skill level. Either that, or represent fairness not as an LP objective but as an LP constraint, with bounds on the minimum and maximum skill sum per team. I demonstrate the latter; but note that this is most feasible when we constrain the mean team skill toward the mean skill overall (in this case 50). Defining a range to be too narrow (less than ~ 20%) or defining the target to be far away from the mean make solution difficult.
All together,
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pulp
import shapely
from numpy.random import default_rng, Generator
n_teams = 64
players_per_team = 11
total_players = n_teams * players_per_team
max_skill = 100
def synthesize_world_data(rand: Generator) -> tuple[
geopandas.GeoDataFrame, # player and country data
pd.DataFrame, # field position data
]:
# Load the default world dataset from geopandas,
# dropping a couple of countries with degenerate bounds.
# There are better ways to fix this (not shown here).
world = (
geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
.rename(columns={'name': 'country'})
)
world.index.name = 'country'
world[['left', 'bottom', 'right', 'top']] = shapely.bounds(world['geometry'])
is_degenerate = np.isclose(world['right'] - world['left'], 360)
world = world[~is_degenerate]
# Assume that talent is uniformly distributed throughout the world,
# so players are spatially uniformly distributed. Draw from a pool
# of players ~2 times what is needed for final team formation.
pop_ratio = 2 * total_players / world['pop_est'].sum()
world['players_est'] = world['pop_est'] * pop_ratio
# Hack: perform an inefficient but statistically correct spatial random distribution of players
# within their countries by allocating random coordinates within rectilinear bounds and then doing
# geometric elimination of those players that have decided to live in the ocean
world['geodensity'] = shapely.area(world['geometry'])/(world['right'] - world['left'])/(world['top'] - world['bottom'])
n_soggy_players = (world['players_est'] / world['geodensity']).round().astype(int)
soggy_allocated = pd.DataFrame(
np.arange(n_soggy_players.max()) < n_soggy_players.values[:, np.newaxis],
index=n_soggy_players.index,
columns=pd.RangeIndex(name='country_player', start=0, stop=n_soggy_players.max()),
).replace(False, np.nan).stack()
world, _ = world.align(soggy_allocated, axis=0)
world['player_lon'] = rand.uniform(low=world['left'], high=world['right'])
world['player_lat'] = rand.uniform(low=world['bottom'], high=world['top'])
not_soggy = shapely.contains_xy(world['geometry'], world[['player_lon', 'player_lat']])
world = world[not_soggy]
# Assume that skill and position are uniformly distributed
# https://en.wikipedia.org/wiki/Association_football_positions
world['player_skill'] = rand.uniform(low=0, high=max_skill, size=len(world))
positions = pd.DataFrame(
{
'min_players': [1, 2, 2, 2],
'max_players': [1, 5, 5, 5],
},
index=pd.Index(name='name', data=['goalkeep', 'defender', 'midfield', 'forward']),
)
pos_mean = (positions['min_players'] + positions['max_players'])/2
world['position'] = rand.choice(
a=positions.index,
p=pos_mean/pos_mean.sum(),
size=len(world),
)
world.set_index(
pd.RangeIndex(name='player', start=0, stop=len(world)),
append=False, inplace=True,
)
return world, positions
def guess_team_centroids(rand: Generator, world: geopandas.GeoDataFrame) -> tuple[
pd.DataFrame, # centroids per team
pd.Series, # player-centroid distances
]:
coord_fields = ['player_lat', 'player_lon']
centroid_idx = rand.choice(a=world.index, size=n_teams)
centroids = (
world.loc[
centroid_idx,
coord_fields,
]
.rename(columns={'player_lat': 'team_lat', 'player_lon': 'team_lon'})
.set_index(pd.RangeIndex(name='team', start=0, stop=n_teams))
)
coords = np.deg2rad(
pd.merge(
how='cross', left=centroids.reset_index(),
right=world[coord_fields].reset_index(),
).set_index(['team', 'player'])
)
# Haversine norm (how is this not in geopandas?)
a = (
np.sin((coords['player_lat'] - coords['team_lat'])/2)**2 +
np.sin((coords['player_lon'] - coords['team_lon'])/2)**2
* np.cos(coords['team_lat']) * np.cos(coords['player_lat'])
)
c = np.arctan2(np.sqrt(a), np.sqrt(1 - a))
r = (world.crs.ellipsoid.semi_major_metre + world.crs.ellipsoid.semi_minor_metre) * 1e-3
distances = r*c
return centroids, distances
def make_vars(world: geopandas.GeoDataFrame) -> pd.Series:
team_idx = pd.RangeIndex(name='team', start=0, stop=n_teams)
assign_idx = pd.MultiIndex.from_product((team_idx, world.index))
names = assign_idx.to_frame().astype(str)
assigns = (
'asn_t' + names['team'] + '_p' + names['player']
).apply(pulp.LpVariable, cat=pulp.LpBinary)
return assigns
def make_objective(
assigns: pd.Series,
distances: pd.Series,
) -> pulp.LpAffineExpression:
# For compactness, minimize the distance from each player to their team's centroid
return pulp.lpDot(assigns, distances)/total_players/100
def add_constraints(
prob: pulp.LpProblem,
world: geopandas.GeoDataFrame,
positions: pd.DataFrame,
assigns: pd.Series,
) -> None:
for team, group in assigns.groupby('team'):
# There must be 11 players per team
prob.addConstraint(
name=f'teamsize_t{team}',
constraint=pulp.lpSum(group) == players_per_team,
)
# Enforce competitive team skill sum
skill_deviation = pulp.lpDot(
group,
world.loc[group.index.get_level_values('player'), 'player_skill'],
)/players_per_team - max_skill/2
prob.addConstraint(
name=f'skill_lo_t{team}',
constraint=skill_deviation >= -10,
)
prob.addConstraint(
name=f'skill_hi_t{team}',
constraint=skill_deviation <= 10,
)
# Each player may only be assigned up to one team
for player, group in assigns.groupby('player'):
prob.addConstraint(
name=f'playerexcl_p{player}',
constraint=pulp.lpSum(group) <= 1,
)
# Enforce the team position bounds
for position, (tmin, tmax) in positions.iterrows():
pos_players = world.index[world['position'] == position]
pos_assigns = assigns.loc[(slice(None), pos_players)]
for team, group in pos_assigns.groupby('team'):
total = pulp.lpSum(group)
prob.addConstraint(
name=f'poslo_t{team}_{position}',
constraint=total >= tmin,
)
prob.addConstraint(
name=f'poshi_t{team}_{position}',
constraint=total <= tmax,
)
def solve(prob: pulp.LpProblem, world: geopandas.GeoDataFrame, assigns: pd.Series) -> geopandas.GeoDataFrame:
prob.solve()
if prob.status != pulp.LpStatusOptimal:
raise ValueError('Solution status', prob.status)
assigns = (
assigns.apply(pulp.LpVariable.value)
.astype(int)
.unstack(level='team')
)
team_player_idx, team_idx = assigns.values.nonzero()
world.loc[team_player_idx, 'team'] = team_idx
return world
def dump_solution(
world: geopandas.GeoDataFrame,
) -> None:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
world = world.loc[
world['team'].notna(),
['continent', 'country', 'team', 'position', 'player_skill'],
]
print('Players by country:')
print(world.sort_values(['continent', 'country', 'team', 'position']), end='\n\n')
print('Players by team and position:')
print(world.sort_values(['team', 'position']), end='\n\n')
print('Team summary:')
grouped = world.groupby('team')
teams = world.groupby(['team', 'position']).country.count().unstack('position')
teams.insert(loc=0, column='continent', value=grouped['continent'].agg(pd.Series.mode).astype(str))
teams.insert(loc=1, column='country', value=grouped['country'].agg(pd.Series.mode).astype(str))
teams.insert(loc=2, column='skill', value=grouped['player_skill'].sum() / players_per_team)
teams.sort_values(['continent', 'country', 'skill'], inplace=True)
print(teams, end='\n\n')
def plot_solution(world: geopandas.GeoDataFrame) -> plt.Figure:
fig, ax = plt.subplots()
world['geometry'].boundary.plot(ax=ax)
for team, group in world.groupby('team'):
ax.scatter(group['player_lon'], group['player_lat'])
return fig
def main() -> None:
rand = default_rng(seed=0)
print('Synthesizing world data...')
world, positions = synthesize_world_data(rand)
centroids, distances = guess_team_centroids(rand, world)
print('Making assignment variables...')
assigns = make_vars(world)
prob = pulp.LpProblem(name='football_clustering', sense=pulp.LpMinimize)
print('Defining objective...')
prob.objective = make_objective(assigns, distances)
print('Adding constraints...')
add_constraints(prob, world, positions, assigns)
# print(prob)
print('Solving...')
world = solve(prob, world, assigns)
dump_solution(world)
plot_solution(world)
plt.show()
if __name__ == '__main__':
main()
...
Team summary:
position continent country skill defender forward goalkeep midfield
team
6.0 Africa Algeria 44.963361 3 2 1 5
2.0 Africa Burkina Faso 45.802411 5 3 1 2
56.0 Africa Dem. Rep. Congo 47.018698 3 2 1 5
10.0 Africa Ethiopia 40.294830 3 4 1 3
11.0 Africa Ethiopia 43.277617 2 5 1 3
38.0 Africa Madagascar 59.876627 3 2 1 5
54.0 Africa Morocco 55.331606 3 2 1 5
63.0 Africa Nigeria 40.088676 4 3 1 3
55.0 Africa South Africa 55.269027 3 4 1 3
62.0 Africa Uganda 47.396431 3 2 1 5
36.0 Africa Uganda 48.577035 2 5 1 3
35.0 Africa ['Burundi' 'Dem. Rep. Congo' 'Tanzania'] 53.517437 3 4 1 3
20.0 Asia Bangladesh 57.757983 4 2 1 4
22.0 Asia Bangladesh 58.492236 5 3 1 2
3.0 Asia China 40.099363 2 4 1 4
9.0 Asia China 40.716526 3 2 1 5
34.0 Asia China 41.488606 4 2 1 4
53.0 Asia China 42.986541 4 4 1 2
7.0 Asia China 43.353136 2 5 1 3
49.0 Asia China 44.045816 3 2 1 5
1.0 Asia China 45.908339 3 3 1 4
52.0 Asia China 49.910424 5 3 1 2
40.0 Asia China 50.852176 3 2 1 5
14.0 Asia China 53.105643 3 2 1 5
15.0 Asia China 53.245060 4 2 1 4
23.0 Asia China 57.018255 4 2 1 4
45.0 Asia China 58.395190 3 2 1 5
57.0 Asia China 59.197112 2 3 1 5
26.0 Asia China 59.516640 3 4 1 3
21.0 Asia China 59.535480 4 3 1 3
44.0 Asia India 41.307787 2 5 1 3
48.0 Asia India 41.448672 2 5 1 3
37.0 Asia India 41.713956 2 3 1 5
5.0 Asia India 44.313757 2 5 1 3
19.0 Asia India 47.688853 2 5 1 3
39.0 Asia India 53.535032 4 4 1 2
43.0 Asia India 54.314630 2 3 1 5
50.0 Asia India 56.103479 2 5 1 3
0.0 Asia India 56.155755 2 3 1 5
59.0 Asia India 56.180263 4 4 1 2
17.0 Asia India 58.247667 2 5 1 3
16.0 Asia India 58.393727 2 4 1 4
31.0 Asia India 58.884043 3 3 1 4
41.0 Asia India 59.004900 2 3 1 5
12.0 Asia India 59.838218 2 4 1 4
28.0 Asia Indonesia 44.747853 4 4 1 2
27.0 Asia Indonesia 53.665286 4 2 1 4
60.0 Asia Japan 44.237347 4 2 1 4
4.0 Asia Pakistan 41.385745 2 5 1 3
58.0 Asia Philippines 42.990101 2 3 1 5
42.0 Asia Saudi Arabia 42.746346 4 2 1 4
32.0 Asia Turkey 49.816575 3 4 1 3
51.0 Asia ['China' 'Taiwan'] 42.435033 2 4 1 4
47.0 Asia ['Pakistan' 'Tajikistan'] 45.447561 4 2 1 4
25.0 Europe France 53.116500 5 2 1 3
30.0 Europe Germany 40.177276 3 5 1 2
13.0 Europe Italy 46.469427 3 2 1 5
61.0 Europe Poland 46.816901 3 5 1 2
33.0 Europe Romania 58.854495 3 3 1 4
29.0 North America Mexico 49.474287 2 5 1 3
46.0 North America Mexico 55.503530 3 2 1 5
24.0 North America United States of America 40.777742 2 4 1 4
8.0 South America Brazil 55.562492 5 2 1 3
18.0 South America ['Brazil' 'Venezuela'] 53.190197 3 5 1 2