This notebook builds LDA topic models on the headlines of 13,000+ fact-checking stories in the Tattle archive.
We model headlines rather than body text because the body text of fact-checking stories often contains material unrelated to the misinformation being fact-checked, whereas the headlines focus on the misinformation itself.
Latent Dirichlet Allocation (LDA) is an unsupervised, probabilistic model that generates topic-document and word-topic probability distributions from a collection of text documents. Topics are themes that recur across documents; here, each document is a story headline.
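As a toy illustration of these two distributions (the three-document corpus here is invented for the example and is separate from the pipeline below):
# Toy illustration of LDA's two output distributions
from gensim.corpora import Dictionary
from gensim.models import ldamodel

toy_docs = [["vote", "election", "leader"],
            ["virus", "hospital", "vaccine"],
            ["election", "rally", "vote"]]
toy_dict = Dictionary(toy_docs)
toy_corpus = [toy_dict.doc2bow(doc) for doc in toy_docs]
toy_lda = ldamodel.LdaModel(toy_corpus, id2word=toy_dict, num_topics=2, random_state=0)
print(toy_lda.get_document_topics(toy_corpus[0]))  # topic-document distribution for doc 0
print(toy_lda.show_topic(0))                       # word-topic distribution for topic 0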
import os
import requests
import time
from time import sleep
from random import uniform
import datetime
from datetime import date
import csv
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
import sys
import re
import numpy as np
import pandas as pd
from pprint import pprint
import nltk
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel
from langdetect import detect
from gensim.models.phrases import Phrases, Phraser
from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer as regextoken
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import googletrans
from googletrans import Translator
import pyLDAvis
import pyLDAvis.gensim
from gensim import similarities
def initialize_mongo():
    mongo_url = (
        f'mongodb+srv://{os.environ.get("FACTCHECK_DB_USERNAME")}:{os.environ.get("FACTCHECK_DB_PASSWORD")}'
        '@tattle-data-fkpmg.mongodb.net/test?retryWrites=true&w=majority&ssl=true&ssl_cert_reqs=CERT_NONE'
    )
    cli = MongoClient(mongo_url)
    db = cli[os.environ.get("FACTCHECK_DB_NAME")]
    coll = db[os.environ.get("FACTCHECK_DB_COLLECTION")]
    if coll.count_documents({}) > 0:
        return coll
    else:
        print("Error accessing Mongo collection")
        sys.exit()
coll = initialize_mongo()
df = pd.DataFrame.from_records(coll.find({}))
df.to_csv("factchecking_stories.csv", index=False)
df = pd.read_csv("factchecking_stories.csv")
df = df.drop_duplicates()
# Snapshot of headlines
df["headline"][0:3]
0    Madhu Kishwar tweets photoshopped image of Amu...
1    Photoshopped promo of ABP News over Chandrayaa...
2    Video of scuffle between men in khaki uniform ...
Name: headline, dtype: object
# Save headlines in a variable
raw_data = df["headline"].values.tolist()
# Defining a function that maps each headline to its tokenized, de-accented form
def data_dict(sentences):
    return dict((sentence, ", ".join(simple_preprocess(str(sentence), deacc=True, max_len=100))) for sentence in sentences)
result = data_dict(raw_data)
# Separating non-English headlines using regex
pat = re.compile("[^\x00-\x7F]") # matches non-ASCII characters, a proxy for non-English text
non_eng = [k for k,v in result.items() if pat.search(v)]
eng = [k for k,v in result.items() if not pat.search(v)]
print(len(eng), len(non_eng))
8298 5241
Googletrans is a free library that sends translation requests to the Google Translate API. Random time delays between requests are advised; otherwise Google may (and probably will) block your IP address.
Translating the 5200+ non-English headlines took approximately 6.5 hours.
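If Google does begin rejecting requests mid-run, retrying with longer back-offs is one way to recover. A minimal sketch (the helper name and back-off values are our own choices, not part of googletrans):
def translate_with_retry(text, translator, max_retries=3):
    # Retry with progressively longer pauses; googletrans raises generic
    # exceptions when Google throttles or blocks requests
    for attempt in range(max_retries):
        try:
            return translator.translate(text).text
        except Exception:
            time.sleep(uniform(10, 20) * (attempt + 1))
    return None  # give up on this headline after max_retries attempts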
# Translating non-English headlines using googletrans library
translator = Translator()
translations = []
for doc in non_eng:
    translations.append(translator.translate(doc).text)
    time.sleep(uniform(3, 5))
# Saving the original and translated headlines for future reference
translated_headlines = dict(zip(non_eng, translations))
translations_df = pd.DataFrame(translated_headlines.items(), columns = ["headline", "translation"])
translations_df["original_english"] = 0
translations_df = translations_df.append(pd.DataFrame(eng, columns=['headline']), ignore_index=True, sort=True)
translations_df["original_english"].fillna(value=1, inplace = True)
translations_df["original_english"] = translations_df["original_english"].astype(int)
translations_df.to_csv("headlines_with_translations.csv")
# Combining the headlines
all_headlines = eng + translations
# Tokenizing the headlines
def sent_to_words(sentences):
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)

all_tokens = list(sent_to_words(all_headlines))
# Creating stop words list
stop_words = stopwords.words("english")
# Adding domain words
stop_words.extend(["fake", "fact", "check", "checked", "factcheck", "news", "false",
"falsely", "true", "truth", "viral", "video", "image", "picture",
"photo", "claim", "claiming", "share", "clip", "misleading"])
# Stop word removal
data_stopped = [[word for word in doc if word not in stop_words] for doc in all_tokens]
# Creating bigrams
bigram = gensim.models.Phrases(data_stopped, min_count=10)
for idx in range(len(data_stopped)):
    for token in bigram[data_stopped[idx]]:
        if '_' in token:
            # If the token is a bigram, add it to the document
            data_stopped[idx].append(token)
data_with_bigrams = data_stopped
# Lemmatizing i.e. reducing words to their root form
# Including only nouns and proper nouns, as this improves both topic interpretability and coherence scores
def lemmatization(docs, allowed_postags=["NOUN", "PROPN"]):
    nlp = spacy.load("en_core_web_sm")
    docs_out = []
    for sent in docs:
        doc = nlp(" ".join(sent))
        docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return docs_out
data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN"])
# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]
Topic modelling with the Gensim library involves documents, a corpus, vectors and bag-of-words representations. These concepts are explained at https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html
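As a quick illustration of the dictionary and bag-of-words ideas on a single invented document:
# Toy example: mapping tokens to ids and building a bag-of-words vector
toy = ["modi", "rally", "modi"]
toy_id2word = corpora.Dictionary([toy])
print(toy_id2word.token2id)      # token -> integer id mapping, e.g. {'modi': 0, 'rally': 1}
print(toy_id2word.doc2bow(toy))  # bag-of-words vector [(id, count), ...], e.g. [(0, 2), (1, 1)]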
# Creating a dictionary
id2word = corpora.Dictionary(data_cleaned)
# Checking the vocabulary size, then filtering out tokens that appear in fewer than 20 headlines
print('Number of unique tokens: %d' % len(id2word))
id2word.filter_extremes(no_below=20)
# Creating a document-term matrix
corpus = [id2word.doc2bow(doc) for doc in data_cleaned]
Number of unique tokens: 7585
Topic coherence is an evaluation metric for topic models that measures the degree of semantic similarity between the high-scoring words in a topic. The graph below helps find an appropriate number of topics k to model.
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max number of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with the respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                                  passes=5, per_word_topics=True, chunksize=100,
                                  alpha='auto', eta='auto', eval_every=1, random_state=0)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_cleaned, start=2, limit=20, step=1)
# Show graph
limit=20; start=2; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')
plt.show()
2020-05-06 16:54:18,831 [20296] WARNING gensim.models.ldamodel:145: [JupyterRequire] updated prior is not positive
(the same warning repeats several times during training)
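The scores can also be inspected numerically alongside the graph:
# Locating the number of topics with the highest c_v coherence
best_idx = int(np.argmax(coherence_values))
print("Highest coherence %.3f at k=%d" % (coherence_values[best_idx], start + best_idx * step))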
Based on the graph, we choose k=7; k=11 also scored well but produced some topics that were not as meaningful.
lda_model = ldamodel.LdaModel(corpus=corpus, num_topics=7, id2word=id2word, chunksize=100,
                              alpha='auto', eta='auto', passes=5, random_state=0)
# Printing the topic-word probabilities
pprint(lda_model.print_topics())
[(0,
  '0.094*"bjp" + 0.057*"people" + 0.057*"medium" + 0.044*"modi" + '
  '0.038*"social_media" + 0.036*"temple" + 0.032*"verification" + '
  '0.030*"state" + 0.027*"stock" + 0.026*"statement"'),
 (1,
  '0.123*"virus" + 0.066*"box" + 0.064*"corona_virus" + 0.056*"year" + '
  '0.041*"coronavirus" + 0.040*"student" + 0.039*"coro" + 0.035*"china" + '
  '0.032*"message" + 0.031*"girl"'),
 (2,
  '0.101*"india" + 0.047*"post" + 0.041*"muslim" + 0.041*"country" + '
  '0.041*"lockdown" + 0.036*"rally" + 0.033*"riot" + 0.030*"president" + '
  '0.027*"house" + 0.025*"shah"'),
 (3,
  '0.140*"police" + 0.089*"woman" + 0.081*"name" + 0.049*"man" + 0.049*"child" '
  '+ 0.038*"leader" + 0.028*"worker" + 0.027*"rumor" + 0.026*"khan" + '
  '0.025*"death"'),
 (4,
  '0.090*"gandhi" + 0.069*"minister" + 0.041*"election" + 0.041*"event" + '
  '0.040*"rahul_gandhi" + 0.035*"prime_minister" + 0.034*"color" + '
  '0.032*"body" + 0.028*"vote" + 0.026*"communal_color"'),
 (5,
  '0.058*"rape" + 0.051*"time" + 0.037*"army" + 0.036*"evidence" + '
  '0.036*"food" + 0.030*"mock" + 0.030*"mock_drill" + 0.028*"drill" + '
  '0.028*"day" + 0.026*"footage"'),
 (6,
  '0.091*"delhi" + 0.081*"caa" + 0.055*"protest" + 0.038*"government" + '
  '0.038*"person" + 0.035*"attack" + 0.032*"congress" + 0.030*"kashmir" + '
  '0.030*"money" + 0.028*"pradesh"')]
The above topic-word probability distributions can be interpreted as seven topics or themes that describe the story headlines; the pyLDAvis visualization further below allows interactive exploration of them.
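One way to sanity-check this interpretation is to look at the dominant topic the model assigns to individual headlines (an illustrative snippet, not part of the original analysis):
# Dominant topic for the first few headlines
for i, bow in enumerate(corpus[:3]):
    doc_topics = lda_model.get_document_topics(bow)
    dominant = max(doc_topics, key=lambda t: t[1])
    print("Headline %d -> topic %d (probability %.2f)" % (i, dominant[0], dominant[1]))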
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, sort_topics=True, n_jobs=1)
vis
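The interactive visualization can also be saved as a standalone HTML file for sharing (the file name here is our choice):
# Persisting the pyLDAvis output outside the notebook
pyLDAvis.save_html(vis, "lda_topics.html")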
# Redoing text processing with adjectives, verbs and adverbs included
def lemmatization(docs, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"]):
nlp = spacy.load("en_core_web_sm")
docs_out = []
for sent in docs:
doc = nlp(" ".join(sent))
docs_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
return docs_out
data_lemmatized = lemmatization(data_with_bigrams, allowed_postags=["NOUN", "PROPN", "VERB", "ADJ", "ADV"])
# Removing any stopwords created because of lemmatization
data_cleaned = [[word for word in doc if word not in stop_words] for doc in data_lemmatized]
# Plot frequency distribution of most common words in the headlines
all_sents = [" ".join(doc) for doc in data_cleaned]
tokenizer = regextoken("[a-zA-Z0-9]+")
all_words = tokenizer.tokenize(" ".join(all_sents))
fd = FreqDist()
for word in all_words:
    fd[word] += 1
# Plotting the top 50 most frequent words
# plt.figure(figsize = (15, 10))
# fd.plot(50)
# plt.show()
fd_sorted = dict(fd.most_common(100))
plt.figure(figsize = (15, 10))
plt.suptitle("Top words in 13,539 story headlines on Indian factchecking sites (as of May 2020)", fontsize= 15)
plt.title("source: Tattle archive", fontdict = {'fontsize': 12})
plt.xlabel(xlabel = "Word", fontdict = {'fontsize': 13})
plt.ylabel(ylabel = "Count", fontdict = {'fontsize': 13})
plt.xticks(rotation=90, size =11)
plt.yticks(np.arange(0, 2000, 100))
plt.plot(fd_sorted.keys(), fd_sorted.values())
plt.grid()
plt.tight_layout(pad=4)
plt.savefig("word_plot.png", bbox_inches='tight')
plt.show()
# Plot frequency distribution of most common word-pairs in the headlines
bigrams_list = []
for idx in range(len(data_cleaned)):
    for token in bigram[data_cleaned[idx]]:
        if '_' in token:
            # If the token is a bigram, add it to the list
            bigrams_list.append(token)
fd = FreqDist()
for b in bigrams_list:
    fd[b] += 1
fd_sorted = dict(fd.most_common(100))
plt.figure(figsize = (15, 10))
plt.suptitle("Top bigrams in 13,539 story headlines on Indian factchecking sites (as of May 2020)", fontsize= 15)
plt.title("source: Tattle archive", fontdict = {'fontsize': 12})
plt.xlabel(xlabel = "Bigram", fontdict = {'fontsize': 13})
plt.ylabel(ylabel = "Count", fontdict = {'fontsize': 13})
plt.xticks(rotation=90, size =9)
plt.yticks(np.arange(0, 800, 50))
plt.plot(fd_sorted.keys(), fd_sorted.values())
plt.grid()
plt.tight_layout(pad=4)
plt.savefig("bigram_plot.png", bbox_inches='tight')
plt.show()
We have performed topic modelling and n-gram analysis on 13,000+ fact-checking story headlines, including headlines that were originally in Indian languages. Some possible applications of this analysis are: