Do Androids Dream of Electric Blue?
Paint colors always have such fanciful names like “Flamingo’s Dream” and “Agreeable Gray”. Can we teach a computer to invent new colors and give them fitting names? Let’s give it a shot!
Corpus Colorum
First, let’s gather some paint color information from existing brands.
Digging through the source behind the color-explorer pages for Benjamin Moore, Sherwin-Williams, and Behr, I found some JSON endpoints that give us names, RGB values, and color families for all of their currently available colors. There’s some other information (e.g. “color collection”, “goes great with”) that might be fun to play around with, but for now we’ll just grab this simple information, e.g.:
{
'name': 'SYLVAN MIST',
'rgb': (184, 199, 191),
'family': 'BLUE',
}
If data gathering and cleaning sound boring to you, skip ahead to Exploring the Corpus.
We’ll define a few utility functions as we go along to help us homogenize the data.
ALLOWED_SPECIAL_CHARS = set(' \'"-.,&?')
def clean_name(n):
"""Strip superfluous characters and convert to uppercase"""
return ''.join([
c for c in n.upper()
if c.isalnum() or c in ALLOWED_SPECIAL_CHARS])
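For example, trademark symbols and other stray characters get dropped, while friendlier punctuation survives:

clean_name('Agreeable Gray®')   # -> 'AGREEABLE GRAY'
clean_name("Flamingo's Dream")  # -> "FLAMINGO'S DREAM"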
The Benjamin Moore API gives us data pretty close to what we need. We’ll just have to convert the RGB values from hexadecimal and clean up the names.
import struct
import pandas as pd
import requests
def rgb_from_hex(h):
"""Convert hex string (e.g. "00FF00") to tuple of RGB values (e.g. (0, 255, 0))"""
return struct.unpack('BBB', bytes.fromhex(h))
def load_benjamin_moore():
data = requests.get('https://www.benjaminmoore.com/api/colors').json()
df = pd.DataFrame(list(data['colors'].values()),
columns=['name', 'family', 'hex'])
df[['name', 'family']] = df[['name', 'family']].apply(lambda x: x.apply(clean_name))
df['rgb'] = df['hex'].apply(rgb_from_hex)
return df.drop('hex', axis=1)
benjamin_moore_colors = load_benjamin_moore()
len(benjamin_moore_colors)
4221
benjamin_moore_colors.sample()
| | name | family | rgb |
| --- | --- | --- | --- |
| 3378 | WORN LEATHER SHOES | NEUTRAL | (152, 142, 120) |
Sherwin-Williams is also close, but for some reason the RGB values are given as a single decimal integer.
def rgb_from_dec(d):
"""Convert integer (e.g. 65280) to tuple of RGB values (e.g. (0, 255, 0))"""
return rgb_from_hex(f'{d:06x}')
def load_sherwin_williams():
data = requests.get('https://www.sherwin-williams.com/color-visualization/services/color/SW/all').json()
df = pd.DataFrame(data, columns=['name', 'colorFamilyNames', 'rgb'])
df['name'] = df['name'].apply(clean_name)
df['family'] = df['colorFamilyNames'].apply(lambda x: clean_name(x[0]))
df['rgb'] = df['rgb'].apply(rgb_from_dec)
return df.drop('colorFamilyNames', axis=1)
sherwin_williams_colors = load_sherwin_williams()
len(sherwin_williams_colors)
1746
sherwin_williams_colors.sample()
| | name | family | rgb |
| --- | --- | --- | --- |
| 190 | AMBITIOUS AMBER | ORANGE | (240, 203, 151) |
Behr is a bit tricky as the data is inside of a JavaScript source file instead of a JSON endpoint. Also, the color family data is stored separately from the color information, so we’ll have to join the two together.
import itertools
def get_data_list(js_source):
"""
Extract the JSON string representing a list from a JavaScript
source file of the form 'var data = [ ... ];'
"""
return js_source[js_source.find('[') : js_source.rfind(']') + 1]
def flatten_groups(groups):
"""
For some reason, groups are stored as a list of lists of strings
(which are themselves comma-separated lists of color IDs). Flatten
this into a single list of unique color IDs (order not guaranteed,
since we dedupe through a set).
For example, [['a,b,c', 'd,e'], ['f,g', 'h,i,j'], ['k', 'a,b']]
would become ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k'],
in some order.
"""
return list(set(
itertools.chain.from_iterable(
x.split(',')
for x in itertools.chain.from_iterable(groups))))
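# quick sanity check of the helpers above, on toy inputs:
# get_data_list("var data = [1, 2];")           -> '[1, 2]'
# sorted(flatten_groups([['a,b', 'c'], ['a']])) -> ['a', 'b', 'c']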
def load_behr():
colors_data = requests.get('http://www.behr.com/mainService/services/colornx/all.js').text
df = pd.read_json(get_data_list(colors_data))
# extract first row as column names
df.columns = df.iloc[0]
df = df.reindex(df.index.drop(0))[['id', 'name', 'rgb']]
family_data = requests.get('http://www.behr.com/mainService/services/xml/families.js').text
family_df = pd.read_json(get_data_list(family_data))
family_df['groups'] = family_df['groups'].apply(flatten_groups)
# explode `groups` column into a column for each value in the list
# e.g. {'name': 'Red', 'groups': ['a', 'b', 'c']}
# becomes {'name': 'Red', '0': 'a', '1': 'b', '2': 'c'}
family_df = pd.concat([family_df['name'],
family_df['groups'].apply(pd.Series)],
axis=1)
# melt group columns into a single column, creating a row from each
# e.g. {'name': 'Red', '0': 'a', '1': 'b', '2': 'c'}
# becomes {'name': 'Red', 'id': 'a'}, {'name': 'Red', 'id': 'b'}, {'name': 'Red', 'id': 'c'}
family_df = pd.melt(family_df,
id_vars=['name'],
value_name='id')[['name', 'id']].dropna()
# join families to colors by ID
df = df.merge(family_df, on='id', suffixes=['_color', '_family'])
df[['name', 'family']] = df[['name_color', 'name_family']].apply(lambda x: x.apply(clean_name))
df['rgb'] = df['rgb'].apply(lambda x: rgb_from_hex(x[1:]))
return df[['name', 'family', 'rgb']]
behr_colors = load_behr()
len(behr_colors)
2891
behr_colors.sample()
| | name | family | rgb |
| --- | --- | --- | --- |
| 1650 | DRIED CHAMOMILE | YELLOW | (209, 179, 117) |
Now that they’re all in the same format, we can combine them into a single DataFrame.
colors = pd.concat([benjamin_moore_colors,
sherwin_williams_colors,
behr_colors]).drop_duplicates()
len(colors)
8137
There are a bunch of colors with weird family names like ‘Timeless Color’ or ‘Historic Color’. For simplicity, let’s discard these.
COLOR_FAMILIES = set([
'RED', 'ORANGE', 'PINK', 'BROWN', 'NEUTRAL', 'GRAY',
'WHITE', 'YELLOW', 'PURPLE', 'BLUE', 'BLACK', 'GREEN'])
colors = colors[
colors['family'].isin(COLOR_FAMILIES)]
len(colors)
7405
Exploring the Corpus
It would be neat if we could view the colors inline.
from IPython.display import HTML
def display_color(color):
return HTML("""
<div style="width: 128px; display: inline-block">
<p><div style="font-weight: bold">{name}</div>{family}</p>
<p><svg width="64" height="64" style="background: #{rgb_hex}" /></p>
<p>#{rgb_hex}</p>
</div>
""".format(
name=color.name,
family=color.family,
rgb_hex=struct.pack('BBB', *color.rgb).hex()))
def display_colors(colors_df):
return HTML("""
<div>
{colors}
</div>
""".format(
colors = '\n'.join(
display_color(c).data for c in colors_df.itertuples())))
display_colors(colors.sample(5))
(Rendered swatches for five random colors: #e36841, #d2d6c7, #0087a9, #b9a0b0, #888169.)
Let’s see which families have the most shades:
colors.groupby('family').count() \
.rename(columns={'name': 'count'}) \
.sort_values('count', ascending=False) \
.reset_index()[['family', 'count']]
| | family | count |
| --- | --- | --- |
| 0 | GREEN | 1145 |
| 1 | BLUE | 995 |
| 2 | RED | 938 |
| 3 | ORANGE | 862 |
| 4 | YELLOW | 766 |
| 5 | BROWN | 755 |
| 6 | PURPLE | 650 |
| 7 | GRAY | 600 |
| 8 | WHITE | 339 |
| 9 | NEUTRAL | 287 |
| 10 | BLACK | 51 |
| 11 | PINK | 17 |
Who knew there could be 51 Shades of Black?
display_colors(colors[colors['family'] == 'BLACK'].sample(5))
(Rendered swatches for five random “black” family colors: #2f3234, #453f3f, #494c4d, #2a2d2e, #3f4348.)
There are some colors with more than one name. Let’s take a look at the one with the most names.
most_common_rgb = colors[colors.duplicated(['rgb'], keep=False)] \
.groupby('rgb').count() \
.rename(columns={'name': 'count'}) \
.nlargest(1, ['count']) \
.reset_index().iloc[0]['rgb']
display_colors(colors[colors['rgb'] == most_common_rgb])
(Rendered swatches: five differently named colors, all sharing the RGB value #ebe4d0.)
All in the Family
A good start for assigning a name to a random color would be to first figure out to which family it belongs.
We’re going to try a few different classifiers, so let’s wrap them in a similar interface:
from sklearn.utils import shuffle
class ColorFamilyClassifier:
def __init__(self, color_df, train_percent=0.8):
self.shuffled = shuffle(color_df)
data = [
(self.get_features(color), self.get_label(color))
for color in self.shuffled.itertuples()
]
cut_index = round(train_percent * len(data))
self.train_set = data[:cut_index]
self.test_set = data[cut_index:]
self.init_classifier()
def get_features(self, color):
raise NotImplementedError
def get_label(self, color):
raise NotImplementedError
def init_classifier(self):
raise NotImplementedError
def accuracy(self):
raise NotImplementedError
def classify(self, color):
raise NotImplementedError
Naïve Bayes
Let’s start by seeing how much mileage we can get with a Naïve Bayes classifier using RGB values as features.
import nltk
class NaiveBayesClassifier(ColorFamilyClassifier):
def get_label(self, color):
return color.family
def init_classifier(self):
self.classifier = nltk.NaiveBayesClassifier.train(self.train_set)
def accuracy(self):
return nltk.classify.accuracy(self.classifier, self.test_set)
def classify(self, color):
return self.classifier.classify(self.get_features(color))
class NaiveBayesRGBClassifier(NaiveBayesClassifier):
def get_features(self, color):
return dict(zip(("red", "green", "blue"), color.rgb))
classifier = NaiveBayesRGBClassifier(colors)
classifier.accuracy()
0.26806212018906145
That’s not very good accuracy. Let’s try different colorspaces:
import colorsys
import husl
from colormath.color_conversions import convert_color
from colormath.color_objects import CMYKColor, LabColor, sRGBColor
# we'll run each classifier multiple times and look at the
# mean and standard deviation over all of the runs
RUNS_PER_CLASSIFIER = 5
class NaiveBayesHSVClassifier(NaiveBayesClassifier):
def get_features(self, color):
hsv = colorsys.rgb_to_hsv(*color.rgb)
return dict(zip(("hue", "saturation", "value"), hsv))
class NaiveBayesHLSClassifier(NaiveBayesClassifier):
def get_features(self, color):
hls = colorsys.rgb_to_hls(*color.rgb)
return dict(zip(("hue", "lightness", "saturation"), hls))
class NaiveBayesHUSLClassifier(NaiveBayesClassifier):
def get_features(self, color):
hsl = husl.rgb_to_husl(*color.rgb)
return dict(zip(("hue", "saturation", "lightness"), hsl))
class NaiveBayesCMYKClassifier(NaiveBayesClassifier):
def get_features(self, color):
rgb = sRGBColor(*color.rgb)
cmyk = convert_color(rgb, CMYKColor)
return dict(zip(("cyan", "magenta", "yellow", "black"), (getattr(cmyk, v) for v in CMYKColor.VALUES)))
class NaiveBayesLabClassifier(NaiveBayesClassifier):
def get_features(self, color):
rgb = sRGBColor(*color.rgb)
lab = convert_color(rgb, LabColor)
return dict(zip(("lightness", "green-red", "blue-yellow"), (getattr(lab, v) for v in LabColor.VALUES)))
results = [
(c.__name__, c(colors).accuracy())
for c in [
NaiveBayesCMYKClassifier, NaiveBayesHLSClassifier, NaiveBayesHSVClassifier,
NaiveBayesHUSLClassifier, NaiveBayesLabClassifier, NaiveBayesRGBClassifier]
for _ in range(RUNS_PER_CLASSIFIER)
]
pd.DataFrame(results, columns=['Classifier', 'Accuracy']) \
.groupby('Classifier') \
.agg({'Accuracy': ['mean', 'std']}) \
.reset_index() \
.sort_values(('Accuracy', 'mean'), ascending=False)
| | Classifier | Accuracy (mean) | Accuracy (std) |
| --- | --- | --- | --- |
| 0 | NaiveBayesCMYKClassifier | 0.403241 | 0.010765 |
| 2 | NaiveBayesHSVClassifier | 0.370155 | 0.010850 |
| 1 | NaiveBayesHLSClassifier | 0.360702 | 0.001464 |
| 5 | NaiveBayesRGBClassifier | 0.262255 | 0.007603 |
| 4 | NaiveBayesLabClassifier | 0.226604 | 0.011524 |
| 3 | NaiveBayesHUSLClassifier | 0.225928 | 0.010810 |
~40% still isn’t great. Let’s try a different kind of classifier.
k-Nearest Neighbor
Let’s try k-NN classifiers over these colorspaces and values of N.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
class KNNColorClassifier(ColorFamilyClassifier):
def __init__(self, color_corpus, train_percent=0.8, n_neighbors=5):
# labels in the KNeighborsClassifier are integers, so we'll create
# a unique integer label for each color family and map both ways for convenience
families = color_corpus['family'].unique()
self.family_map = {f: i for i, f in enumerate(families)}
self.reverse_family_map = {v: k for k, v in self.family_map.items()}
self.n_neighbors = n_neighbors
super().__init__(color_corpus, train_percent)
def get_label(self, color):
return self.family_map[color.family]
def init_classifier(self):
self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
[features, labels] = zip(*self.train_set)
self.classifier.fit(features, labels)
def accuracy(self):
[features, labels] = zip(*self.test_set)
return self.classifier.score(features, labels)
def classify(self, color):
return self.reverse_family_map[
self.classifier.predict(
[self.get_features(color)]
)[0]
]
def get_neighbors(self, color):
return self.shuffled.iloc[[
i for i in self.classifier.kneighbors(
np.array(self.get_features(color)).reshape(1, -1),
return_distance=False
)[0]
]]
# we'll try values of n_neighbors in this range
MIN_N = 1
MAX_N = 20
class KNNRGBClassifier(KNNColorClassifier):
def get_features(self, color):
return color.rgb
class KNNHSVClassifier(KNNColorClassifier):
def get_features(self, color):
return colorsys.rgb_to_hsv(*color.rgb)
class KNNHLSClassifier(KNNColorClassifier):
def get_features(self, color):
return colorsys.rgb_to_hls(*color.rgb)
class KNNHUSLClassifier(KNNColorClassifier):
def get_features(self, color):
return husl.rgb_to_husl(*color.rgb)
class KNNCMYKClassifier(KNNColorClassifier):
def get_features(self, color):
rgb = sRGBColor(*color.rgb)
cmyk = convert_color(rgb, CMYKColor)
return tuple(getattr(cmyk, v) for v in CMYKColor.VALUES)
class KNNLabClassifier(KNNColorClassifier):
def get_features(self, color):
rgb = sRGBColor(*color.rgb)
lab = convert_color(rgb, LabColor)
return tuple(getattr(lab, v) for v in LabColor.VALUES)
results = [
(c.__name__, n, c(colors, n_neighbors=n).accuracy())
for c in [
KNNCMYKClassifier, KNNHLSClassifier, KNNHSVClassifier,
KNNHUSLClassifier, KNNLabClassifier, KNNRGBClassifier]
for n in range(MIN_N, MAX_N + 1)
for _ in range(RUNS_PER_CLASSIFIER)
]
pd.DataFrame(results, columns=['Classifier', 'N', 'Accuracy']) \
.groupby(['Classifier', 'N']) \
.agg({'Accuracy': ['mean', 'std']}) \
.reset_index() \
.sort_values(('Accuracy', 'mean'), ascending=False) \
.head(5)
| | Classifier | N | Accuracy (mean) | Accuracy (std) |
| --- | --- | --- | --- | --- |
| 95 | KNNLabClassifier | 16 | 0.742606 | 0.004173 |
| 93 | KNNLabClassifier | 14 | 0.737205 | 0.006377 |
| 99 | KNNLabClassifier | 20 | 0.735044 | 0.004930 |
| 88 | KNNLabClassifier | 9 | 0.734504 | 0.006691 |
| 96 | KNNLabClassifier | 17 | 0.734099 | 0.006909 |
It looks like the Lab colorspace was the most accurate, and 16 neighbors seems to have slightly outperformed other values in our range.
~74% accuracy should be “good enough” for our purposes. Let’s put this classifier to work!
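# train on the full corpus: with train_percent=1.0 there is no held-out
# test set (so accuracy() would have nothing to score), which is fine
# since from here on we'll be naming brand-new colors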
classifier = KNNLabClassifier(colors, train_percent=1.0, n_neighbors=16)
Testing out the Classifier
Let’s see how far off the classifier is when it’s wrong. Is it pretty close (e.g. classifying an orange as a red or a yellow) or way off (e.g. classifying a blue as a pink)?
classified = pd.DataFrame(
((c.family, classifier.classify(c)) for c in colors.itertuples()),
columns=['Expected', 'Actual'])
totals = classified.groupby(['Expected']).size()
results = classified.groupby(['Expected', 'Actual']).size().reset_index(name='Count')
results['Pct'] = results.apply(lambda x: x['Count'] / totals.loc[x['Expected']], axis=1)
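As an aside, pandas can also produce the full confusion matrix in one shot; a minimal equivalent sketch using crosstab:

# rows are expected families, columns are predicted families
confusion = pd.crosstab(classified['Expected'], classified['Actual'])
# row-normalize so each row shows the share of that expected family
confusion_pct = confusion.div(confusion.sum(axis=1), axis=0)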
First, let’s see which families the classifier most accurately identifies.
results[results['Expected'] == results['Actual']] \
.sort_values('Pct', ascending=False)
| | Expected | Actual | Count | Pct |
| --- | --- | --- | --- | --- |
| 32 | GREEN | GREEN | 996 | 0.869869 |
| 69 | RED | RED | 812 | 0.865672 |
| 5 | BLUE | BLUE | 836 | 0.840201 |
| 50 | ORANGE | ORANGE | 680 | 0.788863 |
| 62 | PURPLE | PURPLE | 504 | 0.775385 |
| 79 | WHITE | WHITE | 262 | 0.772861 |
| 23 | GRAY | GRAY | 448 | 0.746667 |
| 87 | YELLOW | YELLOW | 556 | 0.725849 |
| 11 | BROWN | BROWN | 538 | 0.712583 |
| 0 | BLACK | BLACK | 35 | 0.686275 |
| 41 | NEUTRAL | NEUTRAL | 75 | 0.261324 |
Next, let’s look at each family and see which family it is most commonly misidentified as.
results[results['Expected'] != results['Actual']] \
.sort_values('Pct', ascending=False) \
.drop_duplicates('Expected')
| | Expected | Actual | Count | Pct |
| --- | --- | --- | --- | --- |
| 56 | PINK | RED | 14 | 0.823529 |
| 39 | NEUTRAL | GRAY | 89 | 0.310105 |
| 2 | BLACK | GRAY | 9 | 0.176471 |
| 81 | YELLOW | BROWN | 93 | 0.121410 |
| 47 | ORANGE | BROWN | 66 | 0.076566 |
| 19 | BROWN | YELLOW | 49 | 0.064901 |
| 60 | PURPLE | GRAY | 42 | 0.064615 |
| 7 | BLUE | GREEN | 60 | 0.060302 |
| 65 | RED | BROWN | 55 | 0.058635 |
| 21 | GRAY | BLUE | 35 | 0.058333 |
| 75 | WHITE | NEUTRAL | 15 | 0.044248 |
| 35 | GREEN | YELLOW | 44 | 0.038428 |
This makes sense as we have very few datapoints in the pink family, neutral and gray have a lot of overlap, and most of the paint colors in the black family are actually gray.
For now let’s just accept this as “good enough” and have some fun: we’ll generate some random RGB values and guess the family each belongs to.
import random
def random_colors(i=1):
return pd.DataFrame({
'name': '???',
'family': '???',
'rgb': (
random.randint(0, 255),
random.randint(0, 255),
random.randint(0, 255),
)
} for _ in range(i))
def classify_colors(df):
return df.apply(classifier.classify, axis=1)
new_colors = random_colors(5)
new_colors['family'] = classify_colors(new_colors)
display_colors(new_colors)
(Rendered swatches for five random colors with their predicted families: #228c42, #42d5f0, #826f9f, #64e599, #4d360b.)
‘Desert Rose’ by Any Other Name
Now that we can make a decent guess at the color family for a random RGB value, let’s build on the names of similar existing colors to create fun new names.
Let’s start by creating our random mystery color.
mystery_color = new_colors.sample(1)
mystery_color['family'] = classify_colors(mystery_color)
display_colors(mystery_color)
(Rendered swatch of the mystery color: #826f9f.)
Let’s look at the closest named colors.
neighbors = classifier.get_neighbors(mystery_color.iloc[0])
display_colors(neighbors)
(Rendered swatches of the 16 nearest named colors: #7e6596, #716998, #8c7eaf, #79669e, #8b7eb1, #8f76af, #887ca5, #7b658b, #997ea8, #74688c, #8b7eba, #655f8e, #987ea4, #9487ba, #75769c, #71588d.)
On your Markov, get set, go!
To generate names for our mystery color, let’s train a Markov chain not just on the names of these closest colors, but on the product of all synonyms of each name’s component words, to give us more variety. We’ll limit synonyms by part of speech so the generated names make slightly more sense.
from nltk.corpus import wordnet
import spacy # faster than wordnet for tokenizing and part-of-speech tagging
nlp = spacy.load("en")
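# note: the 'en' shortcut works in spaCy 1.x/2.x; in spaCy 3+ you'd load
# the model by its full name, e.g. spacy.load('en_core_web_sm')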
# map spaCy POS to WordNet
POS_MAP = {
'ADJ': 'a',
'ADV': 'r',
'NOUN': 'n',
'VERB': 'v',
}
def get_syns(token):
"""get synonyms for a spaCy token"""
synsets = wordnet.synsets(token.orth_, pos=POS_MAP.get(token.pos_))
if synsets:
return itertools.chain.from_iterable(s.lemma_names() for s in synsets)
return [token.orth_]
def explode(color_name):
"""explode a color name into the product of all of its component words' synonyms"""
return set(
' '.join(variant).replace('_', ' ').upper()
for variant in itertools.product(
*(get_syns(token) for token in nlp(color_name.lower()))))
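To get a feel for it (output is illustrative only, since the exact variants depend on your spaCy model and installed WordNet data):

explode('worn leather')
# might include variants like 'WORN LEATHER', 'HAGGARD LEATHER', 'WEAR LEATHER', ...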
import string
import markovify
def make_markov_model(colors):
return markovify.Text(None, # we're pre-parsing the sentences
parsed_sentences=[
variant.split()
for variant in set(
itertools.chain.from_iterable(
colors['name'].apply(explode).values))
])
def name_color(color):
model = make_markov_model(classifier.get_neighbors(color))
return string.capwords(
model.make_sentence(
# we're generating short names and don't care about overlap with original text
test_output=False, max_words=3))
mystery_color['name'] = name_color(mystery_color.iloc[0])
display_colors(mystery_color)
(Rendered swatch of #826f9f with its freshly generated name.)
Not bad for a computer. Let’s try it some more!
new_colors['name'] = new_colors.apply(name_color, axis=1)
display_colors(new_colors)
(Rendered swatches of the five random colors, now with generated names: #228c42, #42d5f0, #826f9f, #64e599, #4d360b.)
Most of the generated names will be nonsensical (and many also NSFW), but I did come across a few good ones. Here are the highlights:
(Rendered swatches of the highlighted colors: #6e3281, #b70a1f, #10cffb, #e77616, #8fb0eb, #52382e, #5bfee2, #c44312, #f0d299, #e8c467, #5d04f2, #74b7c5, #84e8e9, #d7b743.)
Journey’s End (#BAC9D6)
For now it’s time to climb back out of the rabbit hole, but maybe one day we can teach our algorithm about puns (or even just include homophones in addition to synonyms to increase the likelihood of accidental puns).
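If you’d like a head start on the homophone idea, here’s a minimal sketch assuming NLTK’s CMU Pronouncing Dictionary corpus is downloaded (get_homophones is a hypothetical helper, not part of the pipeline above):

from nltk.corpus import cmudict

PRONUNCIATIONS = cmudict.dict()  # maps each word to a list of phoneme lists

def get_homophones(word):
    """Hypothetical helper: find words sharing a pronunciation with `word`."""
    target = {tuple(p) for p in PRONUNCIATIONS.get(word.lower(), [])}
    return sorted(
        w for w, prons in PRONUNCIATIONS.items()
        if w != word.lower() and any(tuple(p) in target for p in prons))

get_homophones('blue')  # -> e.g. ['bleu', 'blew']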
Thanks for humoring me, and go have some fun with computers.