What's the Most Forgotten State?
import pandas as pd
import matplotlib as mpl
import requests
import lxml.html
import itertools
from StringIO import StringIO
import mplstyle
import mplstyle.styles.simple
# Apply the "simple" style from mplstyle, then layer two overrides on top of
# it: a (10, 8) figure size and a line marker size of 0.
mplstyle.set(mplstyle.styles.simple)
mplstyle.set({
"figure.figsize": (10, 8),
"lines.markersize": 0
})
from IPython.display import HTML
from markdown import markdown
def md(x):
    """Convert the Markdown text ``x`` with ``markdown`` and wrap it in ``HTML``."""
    return HTML(markdown(x))
A friend recently challenged me to name all 50 U.S. states. I named 49, forgetting Michigan. Which made me wonder: What's the most forgotten state? And what's the most memorable?
# Download the Sporcle results page and parse it into an lxml document.
QUIZ_RESULTS_URL = "http://www.sporcle.com/games/g/states/results"
results_html = requests.get(QUIZ_RESULTS_URL).text
results_dom = lxml.html.fromstring(results_html)

# Text of the first <strong> inside ".dividerText"; it is later interpolated
# into the write-up as the number of times the quiz has been played.
times_played_str = results_dom.cssselect(".dividerText strong")[0].text

# Every "table.answer" contributes all of its <tr> rows except the first;
# the per-table lists are then flattened into a single list of rows.
answer_tables = results_dom.cssselect("table.answer")
result_rows_nested = [table.cssselect("tr")[1:] for table in answer_tables]
result_rows = [tr for table_rows in result_rows_nested for tr in table_rows]
def data_from_row(tr):
    """Turn one results-table row into an ``{"answer", "pct_correct"}`` dict.

    ``tr`` is queried with ``cssselect``. The first ``td.answer a`` match
    supplies the answer text; the first ``td.percent`` match supplies the
    percentage text, whose final character is dropped before the remainder
    is converted to ``float``.
    """
    answer_link = tr.cssselect("td.answer a")[0]
    percent_cell = tr.cssselect("td.percent")[0]
    percent_text = percent_cell.text
    return {
        "answer": answer_link.text,
        "pct_correct": float(percent_text[:-1]),
    }
# One DataFrame row per quiz answer. The rows are materialised as a list
# (rather than passing the result of map() straight through) so the
# constructor always receives a concrete sequence of dicts.
sporcle_results = pd.DataFrame([data_from_row(tr) for tr in result_rows])

# NOTE(review): reporting the last row as the most forgotten state assumes
# the scraped rows arrive ordered from most- to least-guessed -- confirm
# against the page layout.
# .iloc replaces the deprecated (and since removed) DataFrame.irow accessor.
md(u"""The answer, according to [Sporcle's "US States Quiz"](http://www.sporcle.com/games/g/states/results)
— which, the last time I checked, has been played %s times — is __%s__.""" % (
    times_played_str,
    sporcle_results.iloc[-1]["answer"]
))
The answer, according to Sporcle's "US States Quiz" — which, the last time I checked, has been played 10,983,146 times — is Missouri.
Because of selection bias among people who decide to play the quiz, and the fact that people often retake these types of quizzes, it's hard to say just how forgotten Missouri is. But it's certainly the most forgotten among people who've taken that quiz.
Even so, you'd expect larger states to be, generally, more memorable than smaller states. Using U.S. Census population data for 2010, we can find out which states are punching above their weight. The simplest, crudest approach is to subtract states' Sporcle rank from their population rank. By this measure:
# Download the Census population-estimates CSV and load it into a DataFrame.
CENSUS_URL = "http://www.census.gov/popest/data/national/totals/2012/files/NST_EST2012_ALLDATA.csv"
census_csv = requests.get(CENSUS_URL).text
populations = pd.read_csv(StringIO(census_csv))

# The Census data also includes population figures for
# the nation at-large, as well as regional groupings of states.
#
# Let's select just the states.
in_sporcle = populations["Name"].isin(sporcle_results["answer"].values)
# Take an explicit copy: the next statement adds a column, and assigning
# into a filtered slice of `populations` triggers pandas' chained-assignment
# warning and may not write where intended.
state_populations = populations[in_sporcle].copy()

# Rank 1 = largest 2010 population / highest percentage of correct answers.
state_populations["Pop. Rank"] = state_populations["CENSUS2010POP"].rank(ascending=False)
sporcle_results["Sporcle Rank"] = sporcle_results["pct_correct"].rank(ascending=False)

# Attach each state's population rank to its quiz result, keeping only the
# columns needed for the comparison.
rankings = sporcle_results.merge(
    state_populations[["Name", "Pop. Rank"]],
    how="left",
    left_on="answer",
    right_on="Name")[["answer", "Sporcle Rank", "Pop. Rank"]]

# Positive diff: the state places better in the quiz than in population.
rankings["diff"] = rankings["Pop. Rank"] - rankings["Sporcle Rank"]
def get_state(rows, name):
    """Return the first row of ``rows`` whose "answer" equals ``name``, as a dict.

    ``rows`` is a DataFrame with an "answer" column. The returned dict maps
    column names to that row's values. Raises ``IndexError`` when no row
    matches.
    """
    matches = rows[rows["answer"] == name]
    # .iloc replaces the deprecated (and since removed) DataFrame.irow accessor.
    return matches.iloc[0].to_dict()
def rank_to_string(x):
    """Format a rank as an English ordinal, e.g. ``1`` -> ``"1st"``.

    ``x`` may be an int or a float; it is truncated to an integer first, so
    a tied (averaged) rank such as ``7.5`` is rendered as ``"7th"``.
    """
    n = int(x)
    # 11, 12 and 13 take "th" in every hundred (11th, 112th, ...). The check
    # runs on the truncated value so that 11.5 does not become "11st".
    if n % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return "%d%s" % (n, suffix)
# Look up Missouri's row and report its population rank alongside the
# population-minus-Sporcle rank difference.
missouri = get_state(rankings, "Missouri")
missouri_pop_rank = rank_to_string(missouri["Pop. Rank"])
md(u"""- __Missouri still does terribly__, ranking %s in population,
for a difference of __%d__.""" % (missouri_pop_rank, missouri["diff"]))
- Missouri still does terribly, ranking 18th in population, for a difference of -32.
# The state with the largest positive rank difference, i.e. the one that
# places furthest above its population rank in the quiz.
# DataFrame.sort and DataFrame.irow have been removed from pandas;
# sort_values and .iloc are their replacements.
top_state = rankings.sort_values("diff", ascending=False).iloc[0].to_dict()
md(u"""- __%s is the most over-performing state__, ranking %s in population but
%s in the Sporcle quiz, for a difference of __+%d__.""" % (
    top_state["answer"],
    rank_to_string(top_state["Pop. Rank"]),
    rank_to_string(top_state["Sporcle Rank"]),
    top_state["diff"])
)
- North Dakota is the most over-performing state, ranking 48th in population but 6th in the Sporcle quiz, for a difference of +42.
You can see the full list below. The over-performing states tend to have memorable quirks to them. In the top five, you have the two states outside the Lower 48, a North-South pairing, and the state furthest northeast. Only one of the bottom five touches an ocean.
# Display every state ordered from most over-performing to most
# under-performing, renumbered from 0. DataFrame.sort has been removed from
# pandas; sort_values is its replacement.
rankings.sort_values("diff", ascending=False).reset_index(drop=True)
| | answer | Sporcle Rank | Pop. Rank | diff |
---|---|---|---|---|
0 | North Dakota | 6.0 | 48 | 42.0 |
1 | Alaska | 7.5 | 47 | 39.5 |
2 | South Dakota | 7.5 | 46 | 38.5 |
3 | Hawaii | 12.0 | 40 | 28.0 |
4 | Maine | 14.0 | 41 | 27.0 |
5 | Nevada | 10.5 | 35 | 24.5 |
6 | New Mexico | 13.0 | 36 | 23.0 |
7 | Idaho | 21.0 | 39 | 18.0 |
8 | Rhode Island | 27.5 | 43 | 15.5 |
9 | South Carolina | 10.5 | 24 | 13.5 |
10 | Montana | 31.0 | 44 | 13.0 |
11 | Vermont | 38.0 | 49 | 11.0 |
12 | Wyoming | 39.5 | 50 | 10.5 |
13 | Washington | 3.5 | 13 | 9.5 |
14 | West Virginia | 27.5 | 37 | 9.5 |
15 | Utah | 25.5 | 34 | 8.5 |
16 | Alabama | 15.0 | 23 | 8.0 |
17 | Mississippi | 24.0 | 31 | 7.0 |
18 | New Hampshire | 37.0 | 42 | 5.0 |
19 | Oregon | 22.0 | 27 | 5.0 |
20 | Delaware | 42.0 | 45 | 3.0 |
21 | North Carolina | 9.0 | 10 | 1.0 |
22 | Texas | 1.5 | 2 | 0.5 |
23 | New York | 3.5 | 3 | -0.5 |
24 | California | 1.5 | 1 | -0.5 |
25 | Florida | 5.0 | 4 | -1.0 |
26 | Kansas | 34.5 | 33 | -1.5 |
27 | Arizona | 18.0 | 16 | -2.0 |
28 | Colorado | 25.5 | 22 | -3.5 |
29 | Virginia | 16.0 | 12 | -4.0 |
30 | Arkansas | 36.0 | 32 | -4.0 |
31 | Louisiana | 30.0 | 25 | -5.0 |
32 | Kentucky | 33.0 | 26 | -7.0 |
33 | Georgia | 17.0 | 9 | -8.0 |
34 | Nebraska | 47.0 | 38 | -9.0 |
35 | Michigan | 19.5 | 8 | -11.5 |
36 | Oklahoma | 39.5 | 28 | -11.5 |
37 | New Jersey | 23.0 | 11 | -12.0 |
38 | Ohio | 19.5 | 7 | -12.5 |
39 | Iowa | 43.0 | 30 | -13.0 |
40 | Tennessee | 32.0 | 17 | -15.0 |
41 | Connecticut | 48.0 | 29 | -19.0 |
42 | Illinois | 29.0 | 5 | -24.0 |
43 | Maryland | 44.0 | 19 | -25.0 |
44 | Wisconsin | 45.0 | 20 | -25.0 |
45 | Indiana | 41.0 | 15 | -26.0 |
46 | Minnesota | 49.0 | 21 | -28.0 |
47 | Pennsylvania | 34.5 | 6 | -28.5 |
48 | Massachusetts | 46.0 | 14 | -32.0 |
49 | Missouri | 50.0 | 18 | -32.0 |
In the slopegraph below, you can get a more visceral, visual sense of state forgettableness. States with an absolute rank difference of less than 10 are in gray. Of the states with larger differences, over-performers are in blue, while underperformers are in red.
# Draw a two-point slopegraph: one line per state, running from its
# population rank (x = 0) to its Sporcle rank (x = 1).
#
# Both rank columns are flipped to 51 - rank, so rank 1 plots at y = 50 and
# rank 50 at y = 1. Rows are ordered by population rank, worst first.
slope = rankings.set_index("answer")[["Pop. Rank", "Sporcle Rank"]]\
.sort("Pop. Rank", ascending=False)\
.apply(lambda x: 51 - x)
# Make the chart more vertical
# (the current styles are saved here and restored by the final
# mplstyle.set(prev_styles) call below)
prev_styles = mplstyle.get()
mplstyle.set({ "figure.figsize": (6, 12) })
# Color the lines based on direction and magnitude
# Index into color_cycle: 0 ("gray") when the absolute rank difference is not
# above 9; otherwise 1 when the flipped population value is >= the flipped
# Sporcle value (under-performer) and 2 when it is lower (over-performer).
# NOTE(review): which actual colors sit at positions 1 and 2 depends on
# mplstyle's "axes.color_cycle" -- the accompanying text says red and blue.
# NOTE(review): "> 9" also treats half-step differences such as 9.5 as major,
# whereas the accompanying text says "less than 10" is gray -- confirm intent.
color_cycle = ["gray"] + mplstyle.get("axes.color_cycle")
pos_neg = (slope["Pop. Rank"] < slope["Sporcle Rank"])
major = (slope["Pop. Rank"] - slope["Sporcle Rank"]).apply(abs) > 9
colors = (major * (pos_neg + 1))\
.apply(lambda x: color_cycle[int(x)])
# Plot the slopegraph
# (transposed so that each state becomes its own line across the two columns)
ax = slope.T.plot(
colors=colors,
alpha=0.7,
legend=False,
xticks=[0,1],
ylim=(0, 51))
# intflo returns x unchanged when it has a fractional part, otherwise int(x),
# so whole ranks are labelled "18" rather than "18.0".
intflo = lambda x: x if x - int(x) else int(x)
# One tick per integer position 1..50.
yticks = [ x + 1 for x in range(50) ]
# Set the left y-axis
# Each label is "State (rank)"; 51 - value undoes the flip applied above.
# NOTE(review): pairing the i-th row of `slope` with the tick at y = i + 1
# assumes the population ranks are exactly 1..50 with no ties -- confirm.
ax.set_yticks(yticks)
ax.set_yticklabels([ "%s (%s)" % (
slope.index[i],
51 - intflo(slope["Pop. Rank"][i])
) for i in range(50) ])
# Set the right y-axis
# A twin axis with the same ticks and limits, labelled from `slope` re-sorted
# by the flipped Sporcle value in ascending order.
# NOTE(review): slope.sort("Sporcle Rank") is evaluated twice per label inside
# the comprehension; tied Sporcle ranks (e.g. 7.5) mean a label's tick
# position only approximates where that state's line ends.
ax2 = ax.twinx()
ax2.set_yticks(yticks)
ax2.set_ylim((0, 51))
ax2.set_yticklabels([ "%s (%s)" % (
slope.sort("Sporcle Rank").index[i],
51 - intflo(slope.sort("Sporcle Rank")["Sporcle Rank"][i])
) for i in range(50) ])
# Restore the styles that were in effect before this chart.
mplstyle.set(prev_styles)
# Presumably here so the notebook cell does not echo the value of the
# previous expression.
pass
Updates:
- 29 Aug. 2013: Fixed erroneous claim that none of the five most underperforming states touch an ocean.