How can you improve searching GitHub using machine learning? Text mining for Information Retrieval
If you spend a considerable amount of time searching or browsing GitHub like me, this presentation is for you. I was frustrated that I sometimes starred repos, then forgot their names and couldn't retrieve them with GitHub's search engine. In this presentation I will introduce information retrieval and explain what it has in common with machine learning, and I will show how unsupervised machine learning (topic models) can help in exploring and searching starred repositories.
Contents:
- Programming language
- Topics word cloud
- Information retrieval crash course
- Bag of Words
- Topics 1–10 (keywords and representative repositories)
- More
import re
import tqdm
import requests
import numpy as np
from markdown import markdown
import nltk
import pandas as pd
from pandas.io.json import json_normalize
from gensim import summarization
from sklearn import decomposition, feature_extraction, manifold
from sklearn.feature_extraction import stop_words
from sklearn import pipeline
import rank_bm25
import seaborn as sns
import wordcloud
import matplotlib.pyplot as plt
from IPython.display import Markdown, display
import bokeh.models
import bokeh.plotting
import bokeh.io
import umap
from sklearn import metrics
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import decimate
from io import StringIO
bokeh.io.output_notebook()
plt.style.use('ggplot')
def printmd(string):
display(Markdown(string))
def get_word_cloud(texts):
text = ' '.join(texts)
return wordcloud.WordCloud(max_font_size=40).generate(text)
def show_word_cloud(wc, figure_kwargs={'figsize': (8, 5)}):
plt.figure(**figure_kwargs)
plt.imshow(wc)
plt.axis('off')
plt.show()
def show_word_cloud_from_texts(text_column):
texts = text_column.fillna('').values
cloud = get_word_cloud(texts)
show_word_cloud(cloud)
Note that I barely know GraphQL: I built this query in GitHub's GraphQL API explorer.
def run_query(query, key): # A simple function to use requests.post to make the API call. Note the json= section.
headers = {'Authorization': 'token ' + key}
request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
if request.status_code == 200:
return request.json()
else:
raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))
# The GraphQL query (with a few additional bits included) itself, defined as a multi-line string.
def get_next_paged_result(result, key):
if result is None:
end_cursor_string = ''
else:
end_cursor_string = 'after: "{}"'.format(result['data']['viewer']['starredRepositories']['pageInfo']['endCursor'])
new_query_string = """{{
viewer {{
starredRepositories(first: 100, {}) {{
pageInfo {{
startCursor
hasNextPage
endCursor
}}
nodes {{
name
owner {{
login
}}
description
primaryLanguage {{
name
}}
repositoryTopics(first: 10) {{
nodes {{
topic {{
name
}}
}}
}}
object(expression: "master:README.md") {{
... on Blob {{
text
}}
}}
}}
}}
}}
}}""".format(end_cursor_string)
return run_query(new_query_string, key)
def get_starred_repo_information(key, n_pages=6):
next_result = None
starred_repo_information = []
for __ in tqdm.tqdm(range(n_pages)):
next_result = get_next_paged_result(next_result, key)
starred_repo_information = starred_repo_information + next_result['data']['viewer']['starredRepositories']['nodes']
return starred_repo_information
Put your GitHub token in github_auth_key.txt.
You have to do this because, unfortunately, there is currently no way to use GitHub's GraphQL API without authentication.
# key = open('github_auth_key.txt', 'r').read().strip()
# starred_repo_information = get_starred_repo_information(key)
Since this notebook is hosted on GitHub Pages, we'll use a cached dataset.
import pickle
starred_repo_information = pickle.load(open('starred_repo_information.pkl', 'rb'))
from nltk import stem, tokenize
lemmatizer = stem.WordNetLemmatizer()
lemmatizer.lemmatize('repositories')
def clean_and_stem(text):
    # strip everything that is not alphanumeric, then lemmatize each token
    cleaned_text = re.sub('[^0-9a-zA-Z]+', ' ', text.lower())
    return ' '.join([lemmatizer.lemmatize(w) for w in tokenize.wordpunct_tokenize(cleaned_text)])
def get_cleaned_starred_repositories_df(repo_information):
repo_df = json_normalize(repo_information)
repo_df.index = repo_df['name']
repo_df.drop('name', axis=1, inplace=True)
repo_df['primaryLanguage'] = repo_df['primaryLanguage.name']
    repo_df = repo_df.drop('primaryLanguage.name', axis=1)
repo_df['topics'] = repo_df['repositoryTopics.nodes'].apply(lambda recs: [r['topic']['name'] for r in recs])
repo_df['topics'] = repo_df['topics'].apply(lambda ts: [lemmatizer.lemmatize(t) for t in ts])
repo_df['description'].fillna('', inplace=True)
repo_df['description_stemmed'] = repo_df['description'].apply(clean_and_stem)
repo_df['description_keywords'] = repo_df['description_stemmed'].apply(summarization.keywords)
repo_df['description_length'] = repo_df['description'].str.split().apply(lambda l: 0 if l is None else len(l))
repo_df = repo_df[repo_df['description_length'] > 0]
return repo_df
def get_topic_representant_indices(topic_weights, topic_idx, num_representants=5):
indices = topic_weights[:, topic_idx].argsort()[::-1]
return indices[:num_representants]
def get_repos_representing_topic(repo_df, topic_weights, topic_idx, num_representants=5):
return repo_df.iloc[get_topic_representant_indices(topic_weights, topic_idx, num_representants)]
def plot_description_lengths(description_lengths):
hist, edges = np.histogram(description_lengths.values, bins=25)
median_description_length = description_lengths.median()
mean_description_length = description_lengths.mean()
p = bokeh.plotting.figure(
title='Description length',
x_axis_label='words in description',
y_axis_label='number of repositories',
plot_height=600, plot_width=800)
p.quad(top=hist, left=edges[:-1], right=edges[1:], bottom=0)
p.line([median_description_length, median_description_length], [0, 140], line_color='red')
bokeh.plotting.show(p)
from bokeh import palettes
def plot_2d_data(data, text_label, cls, show_text=True, subset=None):
palette = palettes.d3['Category20']
x, y = data[:, 0], data[:, 1]
source_df = pd.DataFrame({'x': x, 'y': y, 'text_label': text_label, 'color': [palette[c + 3][c] for c in cls]})
source = bokeh.models.ColumnDataSource(source_df)
TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,"
p = bokeh.plotting.figure(tools=TOOLS, plot_width=800, plot_height=600)
p.scatter(x='x', y='y', source=source, fill_color='color', line_color='color')
if subset is not None:
text_labels = bokeh.models.LabelSet(x='x', y='y', text='text_label', level='glyph',
x_offset=5, y_offset=5, source=bokeh.models.ColumnDataSource(source_df.iloc[subset]), render_mode='canvas', text_font_size='7pt')
p.add_layout(text_labels)
bokeh.plotting.show(p)
starred_repo_df = get_cleaned_starred_repositories_df(starred_repo_information)
Primary language
language_counts = starred_repo_df['primaryLanguage'].fillna('unspecified').value_counts()
Programming language
p = bokeh.plotting.figure(x_range=list(language_counts.index), title='Repository number by language')
p.vbar(x=language_counts.index, top=language_counts, width=1)
p.xaxis.major_label_orientation = "vertical"
bokeh.plotting.show(p)
print(starred_repo_df['description_length'].describe())
plot_description_lengths(starred_repo_df['description_length'])
starred_repo_df = starred_repo_df[starred_repo_df['description_length'] > 5]
Topics word cloud
show_word_cloud_from_texts(starred_repo_df['topics'].apply(' '.join))
Descriptions word cloud
show_word_cloud_from_texts(starred_repo_df['description_stemmed'])
show_word_cloud_from_texts(starred_repo_df['description_keywords'])
Information retrieval crash course
We have a collection of documents $d_i$ and want to find some of them.
We formulate a query $q$, for which the system returns documents with relevance scores.
For queries with known relevant documents, the system can be evaluated like a classifier.
Because of that we use precision and recall (why these instead of accuracy? Almost every document is irrelevant to any given query, so accuracy would be trivially high.)
We can also use ranking metrics.
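A minimal sketch of the classifier view, with made-up relevance judgments for a single query (the arrays below are hypothetical):
import numpy as np
from sklearn import metrics
relevant = np.array([1, 1, 0, 0, 1, 0, 0, 0])   # ground truth: which documents are relevant to the query
retrieved = np.array([1, 0, 0, 0, 1, 1, 0, 0])  # which documents the system returned
print(metrics.precision_score(relevant, retrieved))  # fraction of returned documents that are relevant
print(metrics.recall_score(relevant, retrieved))     # fraction of relevant documents that were returned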
Approaches
- substring matching
- break texts down into words and match them
Honorable mention - inverted index
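A toy inverted index (not used in the rest of the notebook, just to illustrate the idea): map each word to the set of documents containing it, then answer a query by intersecting the posting lists of its words.
from collections import defaultdict
toy_docs = ['information retrieval with topic models', 'bag of words search']  # made-up corpus
inverted_index = defaultdict(set)
for doc_id, doc in enumerate(toy_docs):
    for word in doc.split():
        inverted_index[word].add(doc_id)
def lookup(query):
    # documents that contain every word of the query
    return set.intersection(*(inverted_index[w] for w in query.split()))
lookup('topic models')  # -> {0}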
The Vector Space Model
- represent documents and queries as vectors
- use similarity/dissimilarity (distance) to score documents against a query (see the sketch below)
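A minimal sketch of vector-space scoring on a made-up two-document corpus: embed the documents and the query with the same vectorizer, then rank documents by cosine similarity to the query.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
toy_corpus = ['information retrieval crash course', 'topic models for starred repositories']
vec = TfidfVectorizer()
doc_vectors = vec.fit_transform(toy_corpus)            # one vector per document
query_vector = vec.transform(['information retrieval'])
cosine_similarity(query_vector, doc_vectors)           # higher score = better match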
Bag of Words
- TF-IDF and BM25 can be interpreted this way - similarity is calculated as a dot product in an appropriate space
- sklearn.feature_extraction.text.{Count|Tfidf}Vectorizer
Now we can use machine learning!
import rank_bm25
class SearchEngine:
    def __init__(self, df, bm25_cls=rank_bm25.BM25Okapi, text_col='text'):
        # index the chosen text column with BM25; documents are tokenized by whitespace
        self.bm25 = bm25_cls(df[text_col].str.split())
        self.df = df

    def search(self, query, k=100):
        # score every document against the query and return the top k with a positive score
        scores = self.bm25.get_scores(query.split())
        relevant_indices = np.argsort(-scores)[:k]
        return self.df.iloc[relevant_indices[scores[relevant_indices] > 0]]
search_engine = SearchEngine(starred_repo_df, text_col='description_stemmed')
search_engine.search('information retrieval')
BM25 - a comment
BM comes from 'Best Match'.
Difference from TF-IDF: BM25 is not symmetric - the query and the documents are treated differently (for example because their lengths tend to differ).
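For reference, the standard Okapi BM25 score of document $d$ for query $q$, where $f(w, d)$ is the count of word $w$ in $d$, $|d|$ the document length, $avgdl$ the average document length, and $k_1$, $b$ free parameters (typical values are $k_1 \approx 1.2$–$2$ and $b = 0.75$):
$\text{score}(d, q) = \sum_{w \in q} \text{IDF}(w) \cdot \frac{f(w, d)\,(k_1 + 1)}{f(w, d) + k_1\left(1 - b + b\,\frac{|d|}{avgdl}\right)}$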
Bag of Words
Pros:
- can be very fast
- easy to vectorize
- good if you actually want to search by phrase
Cons:
- extremely high dimensionality - use sparse vectors or die (waiting, or running out of RAM)
- trouble with polysemous words (one word, many meanings)
- the vocabulary mismatch problem - synonymy (many words, one meaning)
from sklearn import feature_extraction
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
term_document_matrix = vectorizer.fit_transform(starred_repo_df['description_stemmed'])
term_document_matrix.shape
search_engine.search('image')
search_engine.search('picture')
Can we do anything about non-exact matches?
- use a different representation (word embeddings etc.)
- change the original representation
Dimensionality reduction on text
- Latent Semantic Indexing (sklearn.decomposition.TruncatedSVD) - see the sketch after this list
- topic models
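A quick LSI sketch on the TF-IDF matrix built above (the number of components is an arbitrary choice here): project documents and a query into the same low-dimensional space and compare them there.
lsi = decomposition.TruncatedSVD(n_components=10)
lsi_document_vectors = lsi.fit_transform(term_document_matrix)             # dense, 10-dimensional document vectors
lsi_query_vector = lsi.transform(vectorizer.transform(['image picture']))  # project the query the same way
metrics.pairwise.cosine_similarity(lsi_query_vector, lsi_document_vectors)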
Topic modeling (repo descriptions)
Idea - documents are probability distributions over the vocabulary.
Model documents as mixtures of several latent factors (topics).
This can also be viewed as soft clustering (and turned into hard clustering by keeping only the largest component).
I tried both LDA and NMF here and chose NMF because its results looked better.
TL;DR NMF is a simpler model than LDA, probably better here since we have only ~500 examples
Nonnegative Matrix Factorization
Assume $t$ topics and let $D$ be the $n \times v$ document-term matrix ($n$ documents, $v$ vocabulary terms).
Find nonnegative $L, T$ minimizing
$\|\underset{n \times v}{D} - \underset{n \times t}{L}\ \underset{t \times v}{T}\|^2_F$
Similar to PCA (a low-rank approximation), but with a nonnegativity constraint.
Regularization can also be added.
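A sketch of the factorization itself with scikit-learn, applied to the TF-IDF matrix from above (the notebook below uses ktrain instead, which wraps a similar decomposition): $L$ holds document-topic weights and $T$ topic-term weights.
nmf = decomposition.NMF(n_components=10, init='nndsvd', max_iter=400)
L = nmf.fit_transform(term_document_matrix)  # n_documents x n_topics
T = nmf.components_                          # n_topics x n_terms
L.shape, T.shape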
starred_repo_df.shape
import ktrain
num_topics = 10
tm = ktrain.text.get_topic_model(
starred_repo_df['description_stemmed'],
n_topics=num_topics,
model_type='nmf',
n_features=term_document_matrix.shape[1],
lda_max_iter=10,
min_df=1,
verbose=0,
hyperparam_kwargs={'nmf_alpha': 0.01, 'l1_ratio': 0.5, 'ngram_range': (1,1)}
)
tm.build(starred_repo_df['description_stemmed'])
tm.print_topics(show_counts=True)
reduced_term_document_matrix = tm.predict(starred_repo_df['description_stemmed'])
representative_repos = [get_repos_representing_topic(starred_repo_df, reduced_term_document_matrix, topic)[['description_stemmed']] for topic in range(num_topics)]
Topic keywords and most representative repositories
topic_words = tm.get_topics()
for topic in range(num_topics):
printmd("""------\n# Topic {}\n------""".format(topic+1))
show_word_cloud_from_texts(representative_repos[topic]['description_stemmed'])
printmd('# Keywords')
display(set(topic_words[topic].split()))
printmd('## **repositories representative for {}th topic:**'.format(topic + 1))
display(representative_repos[topic])
print()
tm.train_recommender(n_neighbors=3, metric='cosine')
def show_results(query):
for res in tm.recommend(query, n=5, n_neighbors=3):
print(res[0])
print()
show_results('search')
show_results('query')
show_results('information retrieval')
Visualizing repository 2D projection
Remark: ktrain also has visualization capabilities, but I liked UMAP better.
umap_red = umap.UMAP(metric='precomputed')
umap_features = umap_red.fit_transform(metrics.pairwise.cosine_distances(reduced_term_document_matrix, reduced_term_document_matrix))
representatives = pd.concat(representative_repos)
representative_indices = np.where(starred_repo_df.index.isin(representatives.index))
umap_df = pd.DataFrame(umap_features)
umap_df.columns = ['x', 'y']
umap_df['name'] = starred_repo_df.index
umap_df['topic'] = np.argmax(reduced_term_document_matrix, axis=1)
hv.notebook_extension('bokeh','matplotlib')
opts.defaults(
opts.RGB(width=400, height=400, xaxis=None, yaxis=None, show_grid=False, bgcolor="black"))
points = hv.Points(umap_df)
labels = hv.Labels(umap_df, ['x','y'], 'name')
points.opts(
opts.Points(
color='topic',
cmap='Category20',
tools=['zoom_in', 'zoom_out', 'hover'], width=800, height=600),
opts.Overlay(width=800, height=600),
)