In this part of the tutorial, we continue exploring the languages scraped from Wikipedia’s most popular languages data dump.
Read Part 1 of this tutorial here. We attach the IPython notebook model-exploration.ipynb here to aid with the exploration:
In [1]:
# -*- coding: utf-8 -*-
%matplotlib inline
import bz2
import io
import matplotlib.pyplot as plt
import numpy as np
import os
import cPickle as pickle
import sys
import re
import seaborn as sns
from lang_map import code_lang_map
from pandas import DataFrame
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from collections import Counter
Calculate the percentages from a pandas dataframe of letter counts and return them as new _perc columns¶
In [2]:
def percentages(df):
    # Divide every count column by the per-language letter total, join the result
    # back with a '_perc' suffix, and keep only the percentage columns.
    df2 = df.join(df.div(df['letters_count'], axis='index'), rsuffix='_perc')
    percs = [col for col in df2.columns if col.endswith('_perc')]
    return df2[percs]
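To see what this produces, here is a tiny made-up example (the dataframe below is hypothetical, not the real data):
# Hypothetical toy dataframe: two languages, two characters, plus the total letter count.
toy = DataFrame({'a': [3, 1], 'b': [1, 3], 'letters_count': [4, 4]},
                index=['lang1', 'lang2'])
print(percentages(toy))
# Each *_perc column holds count / letters_count, e.g. a_perc is 0.75 for lang1.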
Count the number of times each character occurs in each language and grab the top 2000 from each of them¶
In [3]:
def get_top_letters():
    files = os.listdir('articles')
    top_letters = []
    for f in files:
        c = Counter()
        for article in parse('articles/'+f):
            c['articles_count'] += 1
            for letter in article['content']:
                c[letter] += 1
                c['letters_count'] += 1
        # Keep the 2000 most frequent entries and record the language code,
        # which is the file name without its extension.
        d = dict(c.most_common(2000))
        d['lang'] = os.path.splitext(f)[0]
        top_letters.append(d)
    return top_letters
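To sanity-check the counts before pickling, a snippet along these lines (assuming the articles directory used above is in place) prints the five most frequent characters per language:
# Rough sanity check: show the five most frequent characters for each language.
for d in get_top_letters():
    lang = d['lang']
    counts = {k: v for k, v in d.items() if k not in ('lang', 'letters_count', 'articles_count')}
    top5 = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:5]
    print('{0}: {1}'.format(lang, top5))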
Parse data from the scrubbed wikipedia articles using regex and yield a dictionary for each article¶
In [4]:
def parse(filename):
    data = ""
    # regex pattern for scrubbing an extracted wikipedia article
    article_rgx = re.compile(
        r'<doc id="(?P<id>\d+)" url="(?P<url>[^"]+)" title="(?P<title>[^"]+)">\n(?P<content>.+)\n<\/doc>', re.S | re.U)
    with io.open(filename, 'r', encoding='utf8') as f:
        for line in f:
            data += line
            # A closing </doc> tag means the buffer now holds a complete article.
            if line.count('</doc>'):
                m = article_rgx.search(data)
                if m:
                    yield m.groupdict()
                data = ""
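For example, counting how many articles were extracted for a single language looks like this (the file name is just a placeholder):
# Placeholder file name; any of the extracted per-language files would do.
n_articles = sum(1 for _ in parse('articles/en.txt'))
print('{0} articles parsed'.format(n_articles))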
Load the letter counts back from pickle¶
In [5]:
def load_data():
    with open('letters2.pkl', 'rb') as f:
        return pickle.load(f)
Save the letter counts into pickle¶
In [6]:
def dump_data():
top_letters = get_top_letters()
with open('letters2.pkl', 'wb') as handle:
pickle.dump(top_letters, handle, protocol=2)
In [7]:
data = load_data()
In [8]:
df = DataFrame(data)
df.fillna(0, inplace=True)
df = df.set_index('lang')
In [9]:
df.head(7)
Out[9]:
In [10]:
df3 = percentages(df)
df3.values[np.isnan(df3.values)] = np.median(df3.values[~np.isnan(df3.values)])
df3.head(7)
Out[10]:
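The imputation line above fills every missing percentage with the global median of the observed values; an equivalent pandas-only sketch would be:
# Equivalent sketch: fill NaNs with the median of all non-missing percentage values.
df3 = df3.fillna(df3.stack().median())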
In [11]:
num_clusters = 4
palette = sns.color_palette('colorblind', num_clusters)
In [12]:
est = KMeans(num_clusters, max_iter=30000)
est.fit(df3.values)
y_kmeans = est.predict(df3.values)
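Before plotting, it is worth a quick look at how the languages are distributed across the clusters, for example:
# Number of languages assigned to each of the four clusters.
print(np.bincount(y_kmeans))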
Run Principal Component Analysis to reduce the number of columns from 10000 to 2¶
In [13]:
pca = PCA(n_components=2)
pca.fit(df3.values)
X_trans = pca.transform(df3.values)
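Reducing to two components necessarily discards information; scikit-learn reports how much variance the two components retain:
# Fraction of the total variance captured by each of the two principal components.
print(pca.explained_variance_ratio_)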
Plot the results¶
In [14]:
plt.scatter(X_trans[:, 0], X_trans[:, 1], c=[palette[y] for y in y_kmeans], s=50)
Out[14]:
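The bare scatter plot is hard to interpret without labels; a small variation annotates each point with its language code (the rows of X_trans are in the same order as df3.index):
# Same scatter plot, with each point labelled by its language code.
plt.scatter(X_trans[:, 0], X_trans[:, 1], c=[palette[y] for y in y_kmeans], s=50)
for code, (x, y) in zip(df3.index, X_trans):
    plt.annotate(code, (x, y), fontsize=8)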
Helper function for printing similarity distance details in clusters¶
In [15]:
def print_sim(dist, x, y, langs):
print("{0} ({1}, {2})".format(dist, code_lang_map[langs[x]], code_lang_map[langs[y]]))
Find the languages that are most similar¶
In [16]:
cluster_dfs = {}
cluster_langs = {}
cluster_distances = {}
langs = list(code_lang_map.keys())
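Note that the loop below pairs langs[i] with row i of y_kmeans, so this assumes code_lang_map keeps its keys in the same order as the rows of df3; if that is not guaranteed, it is safer to take the codes straight from the dataframe index:
# Safer alternative: take the language codes in the same order as the dataframe rows.
langs = list(df3.index)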
In [17]:
for cluster_num in range(num_clusters):
    indexes = [i for i in range(y_kmeans.shape[0]) if y_kmeans[i] == cluster_num]
    cluster_langs[cluster_num] = [langs[i] for i in indexes]
    cluster_dfs[cluster_num] = df3.loc[cluster_langs[cluster_num], :]
    # Calculate pairwise distances and display
    print('Cluster #{0}'.format(cluster_num))
    cluster_distances[cluster_num] = pairwise_distances(cluster_dfs[cluster_num].values)
    n, m = cluster_distances[cluster_num].shape
    distances = set()
    for i in range(n):
        for j in range(m):
            if i == j:
                continue
            distances.add((cluster_distances[cluster_num][i, j], tuple(sorted([i, j]))))
    # Print the 20 most similar language pairs within this cluster
    for a in sorted(distances)[:20]:
        print_sim(a[0], a[1][0], a[1][1], langs)
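For a single global view rather than per-cluster lists, the same pairwise_distances call can be run over the full dataframe; a sketch that prints the five closest pairs overall:
# Sketch: the five closest language pairs across all languages, ignoring the diagonal.
full = pairwise_distances(df3.values)
pairs = set()
for i in range(full.shape[0]):
    for j in range(i + 1, full.shape[1]):
        pairs.add((full[i, j], (i, j)))
for dist, (i, j) in sorted(pairs)[:5]:
    print_sim(dist, i, j, langs)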