Visualization of jokes, per politician: 2013~2017¶

without NLP!¶

Let's assume these are the people we want to know more about:

# People whose mentions are counted. Each entry is matched as a
# case-insensitive substring of the joke text and is also used as the name
# of the corresponding count column in jokes_df.
# NOTE(review): "Hilary" (one "l") is not a substring of "Hillary", so this
# entry will not count the usual spelling -- confirm it is intended.
name_list = [
    "Hilary",
    "Clinton",
    "Obama",
    "Bush",
    "Trump",
    "Biden",
    "Cheney",
    "Ajit",
    "McCain",
    "Palin",
]

from __future__ import division
import nltk
from nltk.stem.porter import PorterStemmer
import math
import numpy
import pandas as pd
import re
import numpy as np

# Load the joke dump: comma-separated fields, with '|' as the quote character.
jokes_df = pd.read_csv(
    "all_r_jokes.csv",
    delimiter=',',
    quotechar='|',
)
# Rename the four columns: 'q' and 'a' are the two text fields that are
# searched for names below; 'timestamp' is compared against epoch seconds.
jokes_df.columns = ['id', 'timestamp', 'q', 'a']

create function so that we can count the appearance of each name¶

def countStr(doc, subs):
    """Count case-insensitive occurrences of ``subs`` inside ``doc``.

    ``doc`` is lower-cased and every character other than a-z / 0-9 is
    replaced by a space before counting, so punctuation never forms part of
    a match. Counting is plain substring counting (non-overlapping), not
    whole-word matching.

    Args:
        doc: Text to search. ``None`` or any non-``str`` value is reported
            on stdout and counted as 0.
        subs: The substring to look for; it is lower-cased before counting.

    Returns:
        The number of occurrences as an ``int``.
    """
    complaint = None
    if doc is None:
        complaint = 'doc is None'
    elif not isinstance(doc, str):
        complaint = 'doc is not a String'

    if complaint is not None:
        print(complaint)
        return 0

    normalized = re.sub('[^a-z0-9]', ' ', doc.lower())
    needle = subs.lower()
    return normalized.count(needle)

# Add one column per name holding how many times that name occurs in the
# joke's 'q' and 'a' texts combined. countStr prints a message and returns 0
# for non-string cells (presumably NaN from empty fields -- verify).
for name in name_list:
    needle = name.lower()
    jokes_df[name] = jokes_df.apply(
        lambda row: countStr(row['q'], needle) + countStr(row['a'], needle),
        axis=1,
    )

doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String
doc is not a String

Instead of storing a boolean for whether each name occurs, store the number of occurrences of each name in the joke. The boolean can be deduced from the occurrence count, but not the other way around.

A peek at the info we have here:

# Show two randomly sampled jokes whose combined 'q' + 'a' text mentions
# "Obama" more than once.
jokes_df.loc[jokes_df['Obama'] > 1].sample(2)

Plot and animate the entire thing!¶

# Build the boundaries of the time windows to iterate over: the 1st, 10th
# and 20th of every month, as POSIX timestamps. The datetimes are naive, so
# .timestamp() interprets them in the machine's local timezone.
# NOTE(review): the page heading says 2013~2017 but the range below starts
# at 2014 -- confirm which one is intended.
from datetime import datetime
years = range(2014, 2018)
months = range(1, 13)
timestamp_list = []
for y in years:
    for m in months:
        timestamp_list.extend(
            datetime(y, m, day).timestamp() for day in (1, 10, 20)
        )

def toDateStr(t):
    """Format a POSIX timestamp as a ``YYYY-MM-DD`` date string.

    Args:
        t: Seconds since the epoch (int or float).

    Returns:
        The calendar date of ``t`` as a string such as ``'2016-11-08'``.
    """
    # fromtimestamp() without a tz argument converts to local time, which
    # matches the naive local datetimes used to build timestamp_list.
    return datetime.fromtimestamp(t).strftime('%Y-%m-%d')

compile data to plot¶

# One entry per pair of adjacent boundaries in timestamp_list:
# [window start, window end, per-name mention totals in name_list order].
# A joke belongs to a window when start < timestamp <= end, so jokes at or
# before the first boundary and after the last boundary are not counted.
plt_data = []
for window_start, window_end in zip(timestamp_list, timestamp_list[1:]):
    after_start = jokes_df['timestamp'] > window_start
    up_to_end = jokes_df['timestamp'] <= window_end
    window_jokes = jokes_df[after_start & up_to_end]
    name_totals = [window_jokes[person].sum() for person in name_list]
    plt_data.append([window_start, window_end, name_totals])

import matplotlib.pyplot as plt
# Bar positions for the chart: one integer slot per entry in name_list
# (passed to both plt.bar and plt.xticks in animate).
y_pos = range(len(name_list))

def animate(i):
    """Redraw the bar chart for frame ``i`` of ``plt_data``.

    Clears the current axes, draws one bar per name with that window's
    mention totals, and uses the y-axis label to show the window's date
    range as ``YYYY-MM-DD ~ YYYY-MM-DD``.

    Args:
        i: Index into ``plt_data`` selecting the time window to draw.
    """
    window_start, window_end, name_totals = plt_data[i]

    plt.cla()
    plt.bar(y_pos, name_totals, align='center', alpha=0.5, color='b')
    plt.xticks(y_pos, name_list, fontsize=10)
    plt.yticks(fontsize=10)

    # The y-axis label doubles as the caption for the frame's date range.
    date_format = '%Y-%m-%d'
    curr_label = ' ~ '.join(
        datetime.fromtimestamp(boundary).strftime(date_format)
        for boundary in (window_start, window_end)
    )
    plt.ylabel(curr_label, fontsize=15)

#animate(77)

%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import animation, rc
from IPython.display import HTML

# Build the animation: one frame per entry in plt_data, 500 ms apart,
# played once, with animate() redrawing the whole chart for every frame.
fig = plt.figure()
n = len(plt_data)
anim = animation.FuncAnimation(
    fig,
    animate,
    frames=n,
    interval=500,
    repeat=False,
    blit=False,
)
# Render the animation as an interactive JavaScript/HTML widget so it is
# displayed (and can be exported) with the notebook.
HTML(anim.to_jshtml())

MLBlag

NLP: Short Sentence comparison: No NLP (Part 4: How much do we make fun of Trump?)

statistics of jokes of the election week, Nov. 8th, 2016 (Scroll down for the animated version. This is a screen shot.)

Visualization of jokes, per politician: 2013~2017¶

without NLP!¶

create function so that we can count the appearance of each name¶

Plot and animate the entire thing!¶

compile data to plot¶

Now export! It's a bit tricky because we want to export the animation as well.¶

	id	timestamp	q	a	Hilary	Clinton	Obama	Bush	Trump	Biden	Cheney	Ajit	McCain	Palin
163333	7pjqya	1.515627e+09	Bush, Clinton, Obama, and Trump decided to hav...	Trump went first. He finished the race in 10 m...	0	2	2	2	2	0	0	0	0	0
10989	16bj5a	1.357836e+09	Obama, Michelle, and Oprah are on a plane toge...	.... Obama says, i can drop 1 $1,000 bill and ...	0	0	2	0	0	0	0	0	0	0