import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import time
import requests
import json
import csv
import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
warnings.filterwarnings('ignore')
# PART 1: GETTING DATA
Data is extracted from Reddit using the Pushshift API as documented here:
https://github.com/pushshift/api
## Getting dates to extract data between
The Pushshift API returns at most 1000 posts per request, so I create a list of dates and pull ~1000 posts between each consecutive pair of dates.
dates_list = []
# Making a list of dates: each January 1st and June 1st from 2010 until January 1st, 2020
for i in range(10,21):
    dates_list.append('01/01/20'+str(i))
    dates_list.append('01/06/20'+str(i))
# Popping June 2020 since it hasn't happened yet
dates_list.pop()
dates_list
['01/01/2010',
'01/06/2010',
'01/01/2011',
'01/06/2011',
'01/01/2012',
'01/06/2012',
'01/01/2013',
'01/06/2013',
'01/01/2014',
'01/06/2014',
'01/01/2015',
'01/06/2015',
'01/01/2016',
'01/06/2016',
'01/01/2017',
'01/06/2017',
'01/01/2018',
'01/06/2018',
'01/01/2019',
'01/06/2019',
'01/01/2020']
## Converting dates to Unix timestamps
Returns the Unix timestamp that the Pushshift API requires for dates.
def getTimeStamp(date_input):
    return time.mktime(datetime.datetime.strptime(date_input, "%d/%m/%Y").timetuple())
dates = [int(getTimeStamp(date)) for date in dates_list]
dates
[1262332800,
1275375600,
1293868800,
1306911600,
1325404800,
1338534000,
1357027200,
1370070000,
1388563200,
1401606000,
1420099200,
1433142000,
1451635200,
1464764400,
1483257600,
1496300400,
1514793600,
1527836400,
1546329600,
1559372400,
1577865600]
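As a quick sanity check (a hedged sketch: time.mktime works in local time, so the raw numbers vary by timezone, but converting back with fromtimestamp should land on the same calendar date):
```python
# Converting the first timestamp back should land on 1 January 2010,
# since datetime.fromtimestamp inverts time.mktime in local time.
datetime.datetime.fromtimestamp(dates[0])
# e.g. datetime.datetime(2010, 1, 1, 0, 0)
```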
## Getting Pushshift data
Returns the top 1000 posts in the given subreddit between the given times.
Code modified from the following article:
https://medium.com/@RareLoot/using-pushshifts-api-to-extract-reddit-submissions-fb517b286563
def getPushshiftData(after, before, sub):
    url = ('https://api.pushshift.io/reddit/search/submission/?size=1000&after='+
           str(after)+'&before='+str(before)+'&subreddit='+str(sub)+'&sort_type=score'+'&sort=desc')
    print(url)
    r = requests.get(url)
    data = json.loads(r.text)
    return data['data']
## Getting all the titles between the dates chosen earlier
Here I loop through all the dates above and get the top ~1000 posts from the chosen subreddit between each pair of dates.
I end up with 9065 Onion headlines and 15432 Onion-sounding real headlines from r/NotTheOnion.
I then keep the first 9000 and first 15000, respectively, for easier batching.
def getTitles(subreddit):
    titles_new = []
    titles = []
    for i in range(len(dates)-1):
        # Setting up dates
        after = dates[i]
        before = dates[i+1]
        # Getting subreddit data between the dates after and before
        raw_json = getPushshiftData(after, before, subreddit)
        # Extracting just the title
        titles_new = [post['title'] for post in raw_json]
        # Appending new data on
        titles = titles + titles_new
    # A few posts were extracted twice; set gets rid of duplicates
    titles = list(set(titles))
    return titles
not_onion = getTitles('nottheonion')
onion = getTitles('theonion')
onion = onion[:9000]
not_onion = not_onion[:15000]
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1262332800&before=1275375600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1275375600&before=1293868800&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1293868800&before=1306911600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1306911600&before=1325404800&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1325404800&before=1338534000&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1338534000&before=1357027200&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1357027200&before=1370070000&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1370070000&before=1388563200&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1388563200&before=1401606000&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1401606000&before=1420099200&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1420099200&before=1433142000&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1433142000&before=1451635200&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1451635200&before=1464764400&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1464764400&before=1483257600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1483257600&before=1496300400&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1496300400&before=1514793600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1514793600&before=1527836400&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1527836400&before=1546329600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1546329600&before=1559372400&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1559372400&before=1577865600&subreddit=nottheonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1262332800&before=1275375600&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1275375600&before=1293868800&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1293868800&before=1306911600&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1306911600&before=1325404800&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1325404800&before=1338534000&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1338534000&before=1357027200&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1357027200&before=1370070000&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1370070000&before=1388563200&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1388563200&before=1401606000&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1401606000&before=1420099200&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1420099200&before=1433142000&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1433142000&before=1451635200&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1451635200&before=1464764400&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1464764400&before=1483257600&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1483257600&before=1496300400&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1496300400&before=1514793600&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1514793600&before=1527836400&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1527836400&before=1546329600&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1546329600&before=1559372400&subreddit=theonion&sort_type=score&sort=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1559372400&before=1577865600&subreddit=theonion&sort_type=score&sort=desc
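The loop above fires off back-to-back requests, and Pushshift can throttle or occasionally return a malformed response under load. Below is a minimal, purely illustrative sketch of a more forgiving wrapper, assuming the same getPushshiftData signature; it was not part of the original run.
```python
# Hypothetical wrapper (not used above): adds a short delay between requests
# and retries with exponential backoff if the response can't be parsed.
def getPushshiftDataPolite(after, before, sub, retries=3, delay=1):
    for attempt in range(retries):
        try:
            time.sleep(delay)
            return getPushshiftData(after, before, sub)
        except (json.JSONDecodeError, requests.RequestException):
            delay *= 2  # back off before trying again
    return []
```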
## Converting to a pandas DataFrame
Labeling Onion headlines as 1 and r/NotTheOnion headlines as 0.
df1 = pd.DataFrame({'text':onion})
df1['label'] = 1
df2 = pd.DataFrame({'text':not_onion})
df2['label'] = 0
# Combining both datasets
df = pd.concat([df1,df2])
# Shuffling the dataset
df = df.sample(frac=1).reset_index(drop=True)
# Saving the 'uncleaned' dataframe to a csv file
df.to_csv('OnionOrNot.csv', index = False)
# Converting all text to lowercase, fixing ampersands and getting rid
# of dashes and apostrophes as they can mess up the dictionary
df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace(r'&', 'and', regex=True)
df['text'] = df['text'].str.replace(r'-', ' ', regex=True)
df['text'] = df['text'].str.replace(r'[^\s\w]', '', regex=True)
# Saving the dataframe to a csv file
df.to_csv('OnionOrNotClean.csv')
df.head()
|   | text | label |
|---|---|---|
| 0 | entire facebook staff laughs as man tightens p... | 1 |
| 1 | muslim woman denied soda can for fear she coul... | 0 |
| 2 | bold move hulu has announced that theyre gonna... | 1 |
| 3 | despondent jeff bezos realizes hell have to wo... | 1 |
| 4 | for men looking for great single women online ... | 1 |
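To see what the cleaning does to a single headline, here is a small sketch applying the same steps to a plain string (the example headline is invented):
```python
# The same cleaning steps applied to one invented example string:
# lowercase, '&' -> 'and', '-' -> ' ', then strip remaining punctuation.
example = "Man Can't Believe Cats & Dogs Co-Exist"
example = example.lower().replace('&', 'and').replace('-', ' ')
example = re.sub(r'[^\s\w]', '', example)
print(example)  # man cant believe cats and dogs co exist
```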
## Reading in the dataframe
Running this when I return to the project so I don't have to hit the Pushshift API again.
df = pd.read_csv('OnionOrNotClean.csv', index_col = 0)
df.head()
|   | text | label |
|---|---|---|
| 0 | entire facebook staff laughs as man tightens p... | 1 |
| 1 | muslim woman denied soda can for fear she coul... | 0 |
| 2 | bold move hulu has announced that theyre gonna... | 1 |
| 3 | despondent jeff bezos realizes hell have to wo... | 1 |
| 4 | for men looking for great single women online ... | 1 |
# PART 2: ENCODING WORDS AS NUMBERS
## Getting all the words in the training data
vocab_set = set()
sentence_lengths = []
for i in range(len(df)):
    # re.split splits the text into words; update() adds them all to the set
    sentence_words = re.split(r'\s', df.iloc[i]['text'])
    vocab_set.update(sentence_words)
    sentence_lengths.append(len(sentence_words))
## Converting the words to a dictionary
This way we can map the words in the dataframe to lists of numbers
vocab_list = list(vocab_set)
vocab_dict = {vocab_list[i-1]: i for i in range(1, len(vocab_list)+1)}
## Creating a column of the words mapped to numbers
max_length = max(sentence_lengths)
def toNumbers(row):
    words = re.findall(r'([\w]+)', row['text'])
    nums = np.array([vocab_dict[words[j]] for j in range(len(words))])
    return np.pad(nums, (0, max_length - len(nums)), mode='constant')
nums = df.apply(lambda row: toNumbers(row), axis=1)
df['nums'] = nums
df['nums'].head()
0 [10101, 15701, 24365, 6689, 22221, 4330, 4928,...
1 [6556, 7335, 1523, 21250, 6690, 23567, 18468, ...
2 [20493, 17894, 4253, 9925, 21346, 24068, 7515,...
3 [18219, 15505, 9902, 24892, 16634, 10504, 810,...
4 [16068, 3826, 16392, 14837, 1613, 5793, 15082,...
Name: nums, dtype: object
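As a quick illustration of the encoding scheme (a toy sketch with an invented vocabulary, not the real one): word indices start at 1, which leaves 0 free as the padding value, and is also why the Embedding layer in Part 3 sizes its input as len(vocab_set)+1.
```python
# Toy version of the word-to-number encoding (invented vocabulary):
# indices start at 1, so 0 is reserved for padding.
toy_vocab = {'man': 1, 'tightens': 2, 'privacy': 3, 'settings': 4}
sentence = ['man', 'tightens', 'privacy', 'settings']
toy_nums = np.array([toy_vocab[w] for w in sentence])
toy_padded = np.pad(toy_nums, (0, 8 - len(toy_nums)), mode='constant')
print(toy_padded)  # [1 2 3 4 0 0 0 0]
```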
## Converting to NumPy arrays
labels = np.asarray(df['label'].values)
features = np.stack(df['nums'].values)
features.shape, labels.shape
((24000, 64), (24000,))
# PART 3: BUILDING, COMPILING, AND FITTING THE MODEL
def get_compiled_model():
    embedding_dim = 16  # unused here; the Embedding dimension below is 64
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(len(vocab_set)+1, 64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model
model = get_compiled_model()
model.fit(features, labels, batch_size=32, epochs=5, verbose=2, validation_split=0.2);
Train on 19200 samples, validate on 4800 samples
Epoch 1/5
19200/19200 - 173s - loss: 0.4103 - accuracy: 0.8130 - val_loss: 0.3051 - val_accuracy: 0.8715
Epoch 2/5
19200/19200 - 152s - loss: 0.1713 - accuracy: 0.9377 - val_loss: 0.3448 - val_accuracy: 0.8652
Epoch 3/5
19200/19200 - 116s - loss: 0.0685 - accuracy: 0.9778 - val_loss: 0.4366 - val_accuracy: 0.8554
Epoch 4/5
19200/19200 - 121s - loss: 0.0348 - accuracy: 0.9891 - val_loss: 0.6293 - val_accuracy: 0.8344
Epoch 5/5
19200/19200 - 127s - loss: 0.0242 - accuracy: 0.9923 - val_loss: 0.6790 - val_accuracy: 0.8442
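To use the trained model on a new headline, the same cleaning and encoding steps have to be repeated. Below is a minimal sketch, assuming model, vocab_dict, and max_length are still in scope; out-of-vocabulary words are simply dropped here, which is an assumption made only for this example.
```python
# Sketch: score a new headline with the trained model.
# Words not in vocab_dict are dropped; long headlines are truncated to max_length.
def predict_headline(headline):
    text = headline.lower().replace('&', 'and').replace('-', ' ')
    text = re.sub(r'[^\s\w]', '', text)
    words = re.findall(r'\w+', text)
    nums = np.array([vocab_dict[w] for w in words if w in vocab_dict])[:max_length]
    padded = np.pad(nums, (0, max_length - len(nums)), mode='constant')
    return float(model.predict(padded[np.newaxis, :])[0][0])

# Outputs closer to 1 suggest an Onion headline, closer to 0 a real one.
print(predict_headline("Area man shocked to learn something"))
```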
## Tuning parameters, modifying the model, etc.
### Trial 1: ~85% validation accuracy around epoch 5
```python
embedding_dim = 16
model = keras.Sequential([
    layers.Embedding(len(vocab_set)+1, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
```
### Trial 2: ~85% validation accuracy around epoch 5
No discernible change in accuracy from tuning embedding_dim.
```python
embedding_dim = 32
model = keras.Sequential([
    layers.Embedding(len(vocab_set)+1, embedding_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
```
### Trial 3: ~87% validation accuracy on epoch 1
Starts overfitting after epoch 1
```python
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab_set)+1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
```
### Trial 4: ~87% validation accuracy on epoch 1
Again starts overfitting after epoch 1, even with dropout.
```python
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab_set)+1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
```