import re
import tqdm
import requests

import numpy as np

from markdown import markdown
import nltk


import pandas as pd
from pandas.io.json import json_normalize
from gensim import summarization

from sklearn import decomposition, feature_extraction, manifold
from sklearn.feature_extraction import stop_words
from sklearn import pipeline

import rank_bm25


import seaborn as sns
import wordcloud

import matplotlib.pyplot as plt
from IPython.display import Markdown, display

import bokeh.model
import bokeh.plotting
import bokeh.io
import re

import umap
from sklearn import metrics
import holoviews as hv
from holoviews import opts
from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import decimate


import requests
from io import StringIO

bokeh.io.output_notebook()
Loading BokehJS ...
plt.style.use('ggplot')
def printmd(string):
    display(Markdown(string))
    

def get_word_cloud(texts):
    text = ' '.join(texts)
    return wordcloud.WordCloud(max_font_size=40).generate(text)


def show_word_cloud(wc, figure_kwargs={'figsize': (8, 5)}):
    plt.figure(**figure_kwargs)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

  
def show_word_cloud_from_texts(text_column):
    texts = text_column.fillna('').values
    cloud = get_word_cloud(texts)
    show_word_cloud(cloud)

Note that I barely know GraphQL: I made this query in Github's API explorer

def run_query(query, key): # A simple function to use requests.post to make the API call. Note the json= section.
    headers = {'Authorization': 'token ' + key}
    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))

        
# The GraphQL query (with a few aditional bits included) itself defined as a multi-line string.       
def get_next_paged_result(result, key):
  if result is None:
    end_cursor_string = ''
  else:
    end_cursor_string = 'after: "{}"'.format(result['data']['viewer']['starredRepositories']['pageInfo']['endCursor'])
  new_query_string = """{{
    viewer {{
      starredRepositories(first: 100, {}) {{
        pageInfo {{
          startCursor
          hasNextPage
          endCursor
        }}
        nodes {{
          name
          owner {{
            login
          }}
          description
          primaryLanguage {{
            name
          }}
          repositoryTopics(first: 10) {{
            nodes {{
              topic {{
                name
              }}
            }}
          }}
          object(expression: "master:README.md") {{
            ... on Blob {{
              text
            }}
          }}
        }}
      }}
    }}
  }}""".format(end_cursor_string)
  return run_query(new_query_string, key)
def get_starred_repo_information(key, n_pages=6):
    next_result = None
    starred_repo_information = []
    for __ in tqdm.tqdm(range(n_pages)):
      next_result = get_next_paged_result(next_result, key)
      starred_repo_information = starred_repo_information + next_result['data']['viewer']['starredRepositories']['nodes']
    return starred_repo_information

Put your Github token to github_auth_key.txt

You have to do this because, unfortunately, for now there is no way to use GraphQL without authentication.

# key = open('github_auth_key.txt', 'r').read().strip()
# starred_repo_information = get_starred_repo_information()

Since this notebook is hosted on github pages, we'll use cached dataset

import pickle
starred_repo_information = pickle.load(open('starred_repo_information.pkl', 'rb')) 
from nltk import stem, tokenize
lemmatizer = stem.WordNetLemmatizer()
lemmatizer.lemmatize('repositories')


def clean_and_stem(text):
    cleaned_text = re.sub('^[0-9a-zA-Z]+', ' ' , text.lower())
    return ' '.join([lemmatizer.lemmatize(w) for w in tokenize.wordpunct_tokenize(cleaned_text)])


def get_cleaned_starred_repositories_df(repo_information):
    repo_df = json_normalize(repo_information)
    repo_df.index = repo_df['name']
    repo_df.drop('name', axis=1, inplace=True)
    repo_df['primaryLanguage'] = repo_df['primaryLanguage.name']
    repo_df.drop('primaryLanguage.name', axis=1)
    repo_df['topics'] = repo_df['repositoryTopics.nodes'].apply(lambda recs: [r['topic']['name'] for r in recs])
    repo_df['topics'] = repo_df['topics'].apply(lambda ts: [lemmatizer.lemmatize(t) for t in ts])
    repo_df['description'].fillna('', inplace=True)
    repo_df['description_stemmed'] = repo_df['description'].apply(clean_and_stem)
    repo_df['description_keywords'] = repo_df['description_stemmed'].apply(summarization.keywords)
    repo_df['description_length'] = repo_df['description'].str.split().apply(lambda l: 0 if l is None else len(l))
    repo_df = repo_df[repo_df['description_length'] > 0]
    return repo_df
def get_topic_representant_indices(topic_weights, topic_idx, num_representants=5):
    indices = topic_weights[:, topic_idx].argsort()[::-1]
    return indices[:num_representants]


def get_repos_representing_topic(repo_df, topic_weights, topic_idx, num_representants=5):
    return repo_df.iloc[get_topic_representant_indices(topic_weights, topic_idx, num_representants)]
def plot_description_lengths(description_lengths):
    hist, edges = np.histogram(description_lengths.values, bins=25)
    median_description_length = description_lengths.median()
    mean_description_length = description_lengths.mean()

    p = bokeh.plotting.figure(
        title='Description length',
        x_axis_label='words in description',
        y_axis_label='number of repositories',
        plot_height=600, plot_width=800)
    p.quad(top=hist, left=edges[:-1], right=edges[1:], bottom=0)
    p.line([median_description_length, median_description_length], [0, 140], line_color='red')
    bokeh.plotting.show(p)
from bokeh import palettes


def plot_2d_data(data, text_label, cls, show_text=True, subset=None):
    palette = palettes.d3['Category20']
    x, y = data[:, 0], data[:, 1]
    source_df = pd.DataFrame({'x': x, 'y': y, 'text_label': text_label, 'color': [palette[c + 3][c] for c in cls]})
    source = bokeh.models.ColumnDataSource(source_df)

    TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,"
    
    p = bokeh.plotting.figure(tools=TOOLS, plot_width=800, plot_height=600)
    p.scatter(x='x', y='y', source=source, fill_color='color', line_color='color')


    if subset is not None:
        text_labels = bokeh.models.LabelSet(x='x', y='y', text='text_label', level='glyph',
                      x_offset=5, y_offset=5, source=bokeh.models.ColumnDataSource(source_df.iloc[subset]), render_mode='canvas', text_font_size='7pt')
        p.add_layout(text_labels)
    bokeh.plotting.show(p)
starred_repo_df = get_cleaned_starred_repositories_df(starred_repo_information)

Primary language

language_counts = starred_repo_df['primaryLanguage'].fillna('unspecified').value_counts()

Programming language

p = bokeh.plotting.figure(x_range=list(language_counts.index), title='Repository number by language')

p.vbar(x=language_counts.index, top=language_counts, width=1)
p.xaxis.major_label_orientation = "vertical"
bokeh.plotting.show(p)
print(starred_repo_df['description_length'].describe())
plot_description_lengths(starred_repo_df['description_length'])
count    571.000000
mean      10.082312
std        6.451048
min        1.000000
25%        6.000000
50%        9.000000
75%       12.000000
max       69.000000
Name: description_length, dtype: float64
starred_repo_df = starred_repo_df[starred_repo_df['description_length'] > 5]

Topics word cloud

show_word_cloud_from_texts(starred_repo_df['topics'].apply(' '.join))

Descriptions word cloud

show_word_cloud_from_texts(starred_repo_df['description_stemmed'])
show_word_cloud_from_texts(starred_repo_df['description_keywords'])

Information retrieval crash course

We have a collection of documents $d_i$ and want to find some documents.

We formulate a query $q$ for which the system returns some documents with relevance scores.

System can be evaluated (for queries with known responses) as a classifier.

Because of that we use precision and recall scores (why these instead of accuracy?)

Also we can use ranking metrics.

Approaches

  • substring matching

  • break down texts into word and match them

Honorable mention - inverted index

The Vector Space Model

  • represent documents and queries as vectors

  • use similarity/disssimilarity (distance) to score vectors for a query

Bag of Words

  • TF-IDF, BM-25 can be interpreted as this - similarity is calculated as dot product in appropriate space

  • sklearn.text.preprocessing.{Count|TfIdf}Vectorizer

Now we can use machine learning!

import rank_bm25 


class SearchEngine:
    
    def __init__(self, df, bm25_cls=rank_bm25.BM25Okapi, text_col='text'):
        self.bm25 = bm25_cls(df[text_col].str.split())
        self.df = df
        
    def search(self, query, k=100):
        scores = self.bm25.get_scores(query.split())
        #scores = scores[scores > 0]
        relevant_indices = np.argsort(-scores)[:k]
        return self.df.iloc[relevant_indices[scores[relevant_indices] > 0]]

    
search_engine = SearchEngine(starred_repo_df, text_col='description_stemmed')
search_engine.search('information retrieval')
description owner.login primaryLanguage.name repositoryTopics.nodes object.text primaryLanguage object topics description_stemmed description_keywords description_length
name
musicinformationretrieval.com Instructional notebooks on music information r... stevetjoa Jupyter Notebook [{'topic': {'name': 'ipython-notebook'}}, {'to... stanford-mir\n============\n\n[![Stories in Re... Jupyter Notebook NaN [ipython-notebook, music-information-retrieval... notebook on music information retrieval . 6
anserini A Lucene toolkit for replicable information re... castorini Java [{'topic': {'name': 'information-retrieval'}},... Anserini\n========\n[![Build Status](https://t... Java NaN [information-retrieval, lucene] lucene toolkit for replicable information retr... retrieval 8
awesome-information-retrieval A curated list of awesome information retrieva... harpribot NaN [] # Awesome Information Retrieval [![Awesome](ht... NaN NaN [] curated list of awesome information retrieval ... retrieval 8
LIRE Open source library for content based image re... dermotte Java [{'topic': {'name': 'image-retrieval'}}, {'top... # LIRE - Lucene Image Retrieval\nLIRE (Lucene ... Java NaN [image-retrieval, lira, multimedia] source library for content based image retriev... retrieval 12
wikIR A python tool for building large scale Wikiped... getalp Python [] # WIKIR\nA python tool for building large scal... Python NaN [] python tool for building large scale wikipedia... retrieval\nlarge 11
pytrec_eval pytrec_eval is an Information Retrieval evalua... cvangysel C++ [{'topic': {'name': 'information-retrieval'}},... pytrec_eval\n===========\n\npytrec\_eval is a ... C++ NaN [information-retrieval, evaluation] _eval is an information retrieval evaluation t... evaluation 14
query-expansion Developing different methods for expanding a q... phosseini Python [] # query-expansion\n\nThis repository is dedica... Python NaN [] different method for expanding a query / topic... expanding\nexpanded\nquery 19
cnnimageretrieval-pytorch CNN Image Retrieval in PyTorch: Training and e... filipradenovic Python [{'topic': {'name': 'image-retrieval'}}, {'top... ## CNN Image Retrieval in PyTorch: Training an... Python NaN [image-retrieval, convolutional-neural-network... image retrieval in pytorch : training and eval... retrieval 14
StarSpace Learning embeddings for classification, retrie... facebookresearch C++ [] <p align="center"><img width="15%" src="exampl... C++ NaN [] embeddings for classification , retrieval and ... 7
sparse_recovery noiseless/nonnegative sparse recovery and feat... NLPrinceton Python [] # sparse_recovery\n\nThis module provides solv... Python NaN [] / nonnegative sparse recovery and feature retr... sparse 9
revisitop Revisiting Oxford and Paris: Large-Scale Image... filipradenovic Python [{'topic': {'name': 'image-retrieval'}}, {'top... # Revisiting Oxford and Paris: Large-Scale Ima... Python NaN [image-retrieval, matlab, python] oxford and paris : large - scale image retriev... large 8
CBIR 🏞 A content-based image retrieval (CBIR) system pochih Python [{'topic': {'name': 'image-retrieval'}}, {'top... [![Open Source Love](https://badges.frapsoft.c... Python NaN [image-retrieval, computer-vision, gabor, hog,... 🏞 a content - based image retrieval ( cbir ) s... based 7
deep-image-retrieval End-to-end learning of deep visual representat... almazan Python [] # Deep Image Retrieval\n\nThis repository cont... Python NaN [] - to - end learning of deep visual representat... visual 9
minmaxcsa MinMax Circular Sector Arc for External Plagia... duartefellipe Python [{'topic': {'name': 'plagiarism-detection'}}, ... ## Minmax Circular Sector Arcs (MinMaxCSA): A ... Python NaN [plagiarism-detection, locality-sensitive-hash... circular sector arc for external plagiarism ’ ... sector 11
ir-python A python implementation for information retrie... zxzlogic Python [] # ir-python\nA python implementation for infor... Python NaN [] python implementation for information retrieva... retrieval\npython\nindex\nindexing\nsafe\ngoogle 34
ParetoMTL Code for Neural Information Processing Systems... Xi-L Python [] # Pareto Multi-Task Learning\nCode for Neural ... Python NaN [] for neural information processing system ( neu... information 12
Open-IE-Papers Open Information Extraction (OpenIE) and Open ... NPCai NaN [{'topic': {'name': 'openie'}}, {'topic': {'na... # Table of Contents\n\n1. [General](#general)\... NaN NaN [openie, literature-review, paper, nlp, inform... information extraction ( openie ) and open rel... extraction 12
hashing-baseline-for-image-retrieval :octocat:Various hashing methods for image ret... willard-yuan MATLAB [{'topic': {'name': 'hashing-library'}}, {'top... # HABIR Toolkit\n\n[![License](https://img.shi... MATLAB NaN [hashing-library, image-retrieval, ann] : octocat : various hashing method for image r... 11
paws This dataset contains 108,463 human-labeled an... google-research-datasets Python [] # PAWS: Paraphrase Adversaries from Word Scram... Python NaN [] dataset contains 108 , 463 human - labeled and... labeled\nstructure 28
berkeley-doc-summarizer The Berkeley Document Summarizer is a learning... gregdurrett Scala [] berkeley-doc-summarizer\n=====================... Scala NaN [] berkeley document summarizer is a learning - b... document\nsyntactic\nbased 28

BM25 - a comment

BM comes from 'Best Match'

Difference between TF-IDF: is not symmetrical (query and documents are treated in a different way, for example because their lengths tend to differ)

Bag of Words

Pros:

  • can be very fast
  • easy to vectorize
  • good if you actually want to search by phrase

Cons:

  • extremely high dimensionality - use sparse vectors or die (waiting or RAM)
  • troubles with polysemous words
  • vocabulary mismatch problem - synonymy
from sklearn import feature_extraction 

vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2))
term_document_matrix = vectorizer.fit_transform(starred_repo_df['description_stemmed']) 
term_document_matrix.shape
(459, 5005)
search_engine.search('image')
description owner.login primaryLanguage.name repositoryTopics.nodes object.text primaryLanguage object topics description_stemmed description_keywords description_length
name
open-images Build an example image classifier using Google... quiltdata Jupyter Notebook [] # open images\n\nThis repository contains the ... Jupyter Notebook NaN [] an example image classifier using google open ... image 10
nsfw_data_source_urls Collection of NSFW images URLs for the purpose... EBazarov NaN [] # NSFW data source URLs\n\n## Description\n\nR... NaN NaN [] of nsfw image url for the purpose of training ... 14
cnnimageretrieval-pytorch CNN Image Retrieval in PyTorch: Training and e... filipradenovic Python [{'topic': {'name': 'image-retrieval'}}, {'top... ## CNN Image Retrieval in PyTorch: Training an... Python NaN [image-retrieval, convolutional-neural-network... image retrieval in pytorch : training and eval... retrieval 14
FUNIT_tensorflow Tensorflow Implementation of FUNIT: Few-Shot U... zhangqianhui Python [{'topic': {'name': 'image-to-image-translatio... # FUNIT_tensorflow\nTensorflow Implementation ... Python NaN [image-to-image-translation, few-shot-learning... implementation of funit : few - shot unsupervi... 8
imagehash A Python Perceptual Image Hashing Module JohannesBuchner Python [] NaN Python NaN [] python perceptual image hashing module perceptual 6
nsfw_data_scraper Collection of scripts to aggregate image data ... alex000kim Shell [{'topic': {'name': 'nsfw-classifier'}}, {'top... # NSFW Data Scraper\n\n## Note: use with cauti... Shell NaN [nsfw-classifier, nsfw, deep-learning, content... of script to aggregate image data for the purp... image 16
image-to-image-papers 🦓<->🦒 🌃<->🌆 A collection of image to image pa... lzhbrian NaN [{'topic': {'name': 'image-to-image'}}, {'topi... # Image-to-Image papers\n\nA collection of ima... NaN NaN [image-to-image, generative-adversarial-networ... 🦓<->🦒 🌃<->🌆 a collection of image to image pap... constantly 13
snowy Small Image Library for Python 3 prideout Python [{'topic': {'name': 'python'}}, {'topic': {'na... [![Build Status](https://travis-ci.org/prideou... Python NaN [python, image-processing] small image library for python 3 6
image-match 🎇 Quickly search over billions of images EdjoLabs Python [{'topic': {'name': 'image-analysis'}}, {'topi... [![PyPI](https://img.shields.io/pypi/status/im... Python NaN [image-analysis, image-signatures, python, sea... 🎇 quickly search over billion of image 7
revisitop Revisiting Oxford and Paris: Large-Scale Image... filipradenovic Python [{'topic': {'name': 'image-retrieval'}}, {'top... # Revisiting Oxford and Paris: Large-Scale Ima... Python NaN [image-retrieval, matlab, python] oxford and paris : large - scale image retriev... large 8
nsfw-v2 NSFW Image Detector with REST interface develo... sajithm Python [{'topic': {'name': 'python'}}, {'topic': {'na... # nsfw-v2\nAn NSFW detector serving responses ... Python NaN [python, nsfw-recognition, kera, flask, convol... image detector with rest interface developed u... interface 11
Image-to-Image-Search A reverse image search engine powered by elast... sethuiyer Python [{'topic': {'name': 'deep-learning'}}, {'topic... <img src="static/logo.jpg"/>\n\nSmartSearch is... Python NaN [deep-learning, search-engine, elasticsearch, ... reverse image search engine powered by elastic... search 11
MAX-Object-Detector Localize and identify multiple objects in a s... IBM Python [{'topic': {'name': 'docker-image'}}, {'topic'... [![Build Status](https://travis-ci.com/IBM/MAX... Python NaN [docker-image, machine-learning, machine-learn... localize and identify multiple object in a sin... multiple 9
imgdupes Finding and deleting near-duplicate images bas... knjcode Python [{'topic': {'name': 'image'}}, {'topic': {'nam... # imgdupes\n\n`imgdupes` is a command line too... Python NaN [image, dedupe, perceptual-hashing, perceptual... and deleting near - duplicate image based on p... image 9
albumentations fast image augmentation library and easy to us... albumentations-team Python [{'topic': {'name': 'image-augmentation'}}, {'... # Albumentations\n[![PyPI version](https://bad... Python NaN [image-augmentation, machine-learning, augment... image augmentation library and easy to use wra... augmentation 12
CBIR 🏞 A content-based image retrieval (CBIR) system pochih Python [{'topic': {'name': 'image-retrieval'}}, {'top... [![Open Source Love](https://badges.frapsoft.c... Python NaN [image-retrieval, computer-vision, gabor, hog,... 🏞 a content - based image retrieval ( cbir ) s... based 7
deep-image-retrieval End-to-end learning of deep visual representat... almazan Python [] # Deep Image Retrieval\n\nThis repository cont... Python NaN [] - to - end learning of deep visual representat... visual 9
LIRE Open source library for content based image re... dermotte Java [{'topic': {'name': 'image-retrieval'}}, {'top... # LIRE - Lucene Image Retrieval\nLIRE (Lucene ... Java NaN [image-retrieval, lira, multimedia] source library for content based image retriev... retrieval 12
DeOldify A Deep Learning based project for colorizing a... jantic Jupyter Notebook [] \n# DeOldify\n\nImage [<img src="https://colab... Jupyter Notebook NaN [] deep learning based project for colorizing and... old 13
hashing-baseline-for-image-retrieval :octocat:Various hashing methods for image ret... willard-yuan MATLAB [{'topic': {'name': 'hashing-library'}}, {'top... # HABIR Toolkit\n\n[![License](https://img.shi... MATLAB NaN [hashing-library, image-retrieval, ann] : octocat : various hashing method for image r... 11
dl-training-datasets Set of scripts to download datasets of images ... SaMnCo Shell [] # About this repository\n\n**Notes 2016-01-08*... Shell NaN [] of script to download datasets of image and cr... annotation 17
colorization Automatic colorization using deep neural netwo... richzhang Jupyter Notebook [{'topic': {'name': 'caffe'}}, {'topic': {'nam... <!--<h3><b>Colorful Image Colorization</b></h3... Jupyter Notebook NaN [caffe, colorization, automatic-colorization, ... colorization using deep neural network . " col... neural 12
image_captioning Tensorflow implementation of "Show, Attend an... DeepRNN Python [] ### Introduction\nThis neural system for image... Python NaN [] implementation of " show , attend and tell : n... caption 14
Colorizing-with-GANs Grayscale Image Colorization with Generative A... ImagingLab Python [{'topic': {'name': 'deep-learning'}}, {'topic... # Image Colorization with Generative Adversari... Python NaN [deep-learning, generative-adversarial-network... image colorization with generative adversarial... arxiv 8
tencent-ml-images Largest multi-label image database; ResNet-101... Tencent Python [{'topic': {'name': 'database'}}, {'topic': {'... # Tencent ML-Images\n\nThis repository introdu... Python NaN [database, deep-learning, computer-vision] multi - label image database ; resnet - 101 mo... label 11
DeepNude-an-Image-to-Image-technology DeepNude's algorithm and general image generat... yuanxiaosc Python [{'topic': {'name': 'image-to-image'}}, {'topi... # DeepNude-an-Image-to-Image-technology\nGAN e... Python NaN [image-to-image, pix2pix, cycle-gan, dcgan, st... ' s algorithm and general image generation the... general\ngeneration\npix\nmodel 22
pytorch-ssd MobileNetV1, MobileNetV2, VGG based SSD/SSD-li... qfgaohao Python [{'topic': {'name': 'ssd'}}, {'topic': {'name'... # Single Shot MultiBox Detector Implementation... Python NaN [ssd, pytorch, open-images, object-detection] , mobilenetv2 , vgg based ssd / ssd - lite imp... support\nssd\ndataset 28
search_engine.search('picture')
description owner.login primaryLanguage.name repositoryTopics.nodes object.text primaryLanguage object topics description_stemmed description_keywords description_length
name

Can we do anything with nonexact matches?

  • use different representation (word embeddings et c)

  • change original representation

Dimensionality reduction on text

  • Latent Semantic Indexing (sklearn.decomposition.TruncatedSVD)

  • topic models

Topic modeling (repo descriptions)

Idea - documents are probability distributions over vocabulary

Model documents as mixtures of several latent factors

This can be also considered as soft clustering (and turned into clustering by recovering the biggest component)

I used LDA and NMF here, chose NMF because results looked better

TL;DR NMF is a simpler model than LDA, probably better here since we have only ~500 examples

Nonnegative Matrix Factorization

Assume $t$ number of topics.

Find nonnegative $L, T$ minimizing

$\|\underset{n \times v}{D} - \underset{n \times t}{L}\ \underset{t \times D}{T}\|^2_F$

Similar to PCA (rank constraint)

Can also add regularization

starred_repo_df.shape
(459, 11)
import ktrain


num_topics = 10 
tm = ktrain.text.get_topic_model(
    starred_repo_df['description_stemmed'],
    n_topics=num_topics,
    model_type='nmf',
    n_features=term_document_matrix.shape[1],
    lda_max_iter=10,
    min_df=1,
    verbose=0,
    hyperparam_kwargs={'nmf_alpha': 0.01, 'l1_ratio': 0.5, 'ngram_range': (1,1)}
)
using Keras version: 2.2.4-tf
tm.build(starred_repo_df['description_stemmed'])
tm.print_topics(show_counts=True)
topic:2 | count:50 | python using module retrieval http tool algorithm including leveldb implementation
topic:4 | count:33 | text summarization model using evaluation document extractive abstractive deep framework
topic:8 | count:32 | network neural paper list curated code shot zero repository resource
topic:0 | count:29 | learning deep machine shot book interactive scalable source representation model
topic:9 | count:27 | data library structure topological science manifold graph point neighborhood notebook
topic:3 | count:26 | language processing natural nlp polish art course state list datasets
topic:6 | count:22 | image pytorch retrieval implementation based nsfw open training classifier information
topic:5 | count:15 | search com vector talk semantic expansion engine query work build
topic:7 | count:13 | library machine support causal tree framework regression classification gradient inference
topic:1 | count:3 | model task code training semantic similarity achieve paper sentence result
reduced_term_document_matrix = tm.predict(starred_repo_df['description_stemmed'])
representative_repos = [get_repos_representing_topic(starred_repo_df, reduced_term_document_matrix, topic)[['description_stemmed']] for topic in range(num_topics)]

Topic keywords and most representative repositories

topic_words = tm.get_topics()

for topic in range(num_topics):
    printmd("""------\n# Topic {}\n------""".format(topic+1))
    show_word_cloud_from_texts(representative_repos[topic]['description_stemmed'])
    printmd('# Keywords')
    display(set(topic_words[topic].split()))
    printmd('## **repositories representative for {}th topic:**'.format(topic + 1))
    display(representative_repos[topic])
    print()

Topic 1


Keywords

{'book',
 'deep',
 'interactive',
 'learning',
 'machine',
 'model',
 'representation',
 'scalable',
 'shot',
 'source'}

repositories representative for 1th topic:

description_stemmed
name
h2o-3 source fast scalable machine learning platform...
vowpal_wabbit wabbit is a machine learning system which push...
mxnet-the-straight-dope interactive book on deep learning . much easy ...
LearningToCompare_ZSL code for cvpr 2018 paper : learning to compare...
d2l-en into deep learning : an interactive deep learn...


Topic 2


Keywords

{'achieve',
 'code',
 'model',
 'paper',
 'result',
 'semantic',
 'sentence',
 'similarity',
 'task',
 'training'}

repositories representative for 2th topic:

description_stemmed
name
iclr2016 code for training all model in the iclr paper ...
ir-python python implementation for information retrieva...
anchor-baggage code for the article " building topic model ba...
multifit code to reproduce result from paper " multifit...
sentence-similarity implementation of various deep learning model ...


Topic 3


Keywords

{'algorithm',
 'http',
 'implementation',
 'including',
 'leveldb',
 'module',
 'python',
 'retrieval',
 'tool',
 'using'}

repositories representative for 3th topic:

description_stemmed
name
ir-python python implementation for information retrieva...
data-science-ipython-notebooks science python notebook : deep learning ( tens...
boilerpipe3 fork of boilerpipe with python 3 and small fix...
gputil python module for getting the gpu status from ...
xlearn performance , easy - to - use , and scalable m...


Topic 4


Keywords

{'art',
 'course',
 'datasets',
 'language',
 'list',
 'natural',
 'nlp',
 'polish',
 'processing',
 'state'}

repositories representative for 4th topic:

description_stemmed
name
NLP-progress to track the progress in natural language proc...
polish-nlp-resources - trained model and language resource for natu...
Introduction-to-Natural-Language-Processing-UMich-Coursera repository contains weekly assignment on imple...
flair very simple framework for state - of - the - a...
nlp-datasets list of free / public domain datasets with tex...


Topic 5


Keywords

{'abstractive',
 'deep',
 'document',
 'evaluation',
 'extractive',
 'framework',
 'model',
 'summarization',
 'text',
 'using'}

repositories representative for 5th topic:

description_stemmed
name
tf-textanalysis-gcp how to perform text preprocessing using bigque...
nnsum extractive neural network text summarization l...
jann . i am jann . i am text input - text output ch...
Kashgari is a production - ready nlp transfer learning ...
python-sirajnet deep complicated nlp to turn your text into my...


Topic 6


Keywords

{'build',
 'com',
 'engine',
 'expansion',
 'query',
 'search',
 'semantic',
 'talk',
 'vector',
 'work'}

repositories representative for 6th topic:

description_stemmed
name
VectorsInSearch . com repo to accompany the dice . com ' vecto...
columbiau-rocchio-search-query-expander rocchio query expansion - similar to " related...
Kaggle_CrowdFlower place solution for search result relevance com...
gnes is generic neural elastic search , a cloud - n...
Image-to-Image-Search reverse image search engine powered by elastic...


Topic 7


Keywords

{'based',
 'classifier',
 'image',
 'implementation',
 'information',
 'nsfw',
 'open',
 'pytorch',
 'retrieval',
 'training'}

repositories representative for 7th topic:

description_stemmed
name
cnnimageretrieval-pytorch image retrieval in pytorch : training and eval...
pytorch-ssd , mobilenetv2 , vgg based ssd / ssd - lite imp...
nsfw_data_source_urls of nsfw image url for the purpose of training ...
nsfw_data_scraper of script to aggregate image data for the purp...
FUNIT_tensorflow implementation of funit : few - shot unsupervi...


Topic 8


Keywords

{'causal',
 'classification',
 'framework',
 'gradient',
 'inference',
 'library',
 'machine',
 'regression',
 'support',
 'tree'}

repositories representative for 8th topic:

description_stemmed
name
adversarial-robustness-toolbox library for adversarial machine learning ( eva...
catboost fast , scalable , high performance gradient bo...
simpletransformers made simple with training , evaluation , and p...
dowhy is a python library for causal inference that ...
nmslib - metric space library ( nmslib ): an efficien...


Topic 9


Keywords

{'code',
 'curated',
 'list',
 'network',
 'neural',
 'paper',
 'repository',
 'resource',
 'shot',
 'zero'}

repositories representative for 9th topic:

description_stemmed
name
ZeroShotCapsule for paper " zero - shot user intent detection ...
distiller network distiller by intel ai lab : a python p...
awesome-rnn neural network - a curated list of resource de...
LearningToCompare_ZSL code for cvpr 2018 paper : learning to compare...
Inhibited-softmax with code for paper " inhibited softmax for un...


Topic 10


Keywords

{'data',
 'graph',
 'library',
 'manifold',
 'neighborhood',
 'notebook',
 'point',
 'science',
 'structure',
 'topological'}

repositories representative for 10th topic:

description_stemmed
name
topopy library for computing topological data structu...
dagster python library for building data application :...
data-science-ipython-notebooks science python notebook : deep learning ( tens...
industry-machine-learning curated list of applied machine learning and d...
ttk - topological data analysis and visualization ...

tm.train_recommender(n_neighbors=3, metric='cosine')
def show_results(query):
    for res in tm.recommend(query, n=5, n_neighbors=3):
        print(res[0])
        print()

show_results('search')
and memory - efficient ann with a subset - search functionality

simple elasticsearch plugin wrapping around the search endpoint to provide rocchio query expansion

query expansion in semantic meta - search engine . the resulting expansion system is called wiki - metasemantik .

search engine with query expansion

. com repo to accompany the dice . com ' vector in search ' talk by simon hughes , from the activate 2018 search conference , and the ' searching with vector ' talk from haystack 2019 ( u ). build upon my conceptual search and semantic search work from 2015

show_results('query')
and memory - efficient ann with a subset - search functionality

simple elasticsearch plugin wrapping around the search endpoint to provide rocchio query expansion

rocchio query expansion - similar to " related search :" found at popular search engine but based on relevant document selected by the end - user

search engine with query expansion

. com repo to accompany the dice . com ' vector in search ' talk by simon hughes , from the activate 2018 search conference , and the ' searching with vector ' talk from haystack 2019 ( u ). build upon my conceptual search and semantic search work from 2015

show_results('information retrieval')
' s algorithm and general image generation theory and practice research , including pix2pix , cyclegan , ugatit , dcgan , singan and vae model ( tensorflow2 implementation ). deepnude的算法以及通用gan图像生成的理论与实践研究 。

image detector with rest interface developed using kera and flask

image retrieval in pytorch : training and evaluating cnns for image retrieval in pytorch

oxford and paris : large - scale image retrieval benchmarking

: octocat : various hashing method for image retrieval and serf a the baseline

Visualizing repository 2D projection

Remark: ktrain also has visualization capability but I liked UMAP better

umap_red = umap.UMAP(metric='precomputed') 
umap_features = umap_red.fit_transform(metrics.pairwise.cosine_distances(reduced_term_document_matrix, reduced_term_document_matrix))
representatives = pd.concat(representative_repos)
representative_indices = np.where(starred_repo_df.index.isin(representatives.index))
umap_df = pd.DataFrame(umap_features)
umap_df.columns = ['x', 'y']
umap_df['name'] = starred_repo_df.index
umap_df['topic'] =  np.argmax(reduced_term_document_matrix, axis=1)
hv.notebook_extension('bokeh','matplotlib')


opts.defaults(
    opts.RGB(width=400, height=400, xaxis=None, yaxis=None, show_grid=False, bgcolor="black"))



points = hv.Points(umap_df)
labels = hv.Labels(umap_df, ['x','y'], 'name')

points.opts(
    opts.Points(
        color='topic',
        cmap='Category20',
        tools=['zoom_in', 'zoom_out', 'hover'], width=800, height=600),
    opts.Overlay(width=800, height=600),
)
</img> </img> </img>