# install required packages
!pip install ipython==7.10.0
!pip install pyLDAvis
# import required packages
# for data processing
import json, re, nltk, requests
import numpy as np
import pandas as pd
import string
import os
# for text processing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize, sent_tokenize, ngrams, pos_tag, RegexpParser
#from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# for LDA model
import gensim, operator
from scipy import spatial
from gensim.models import KeyedVectors
from gensim.models import ldamodel
from gensim.corpora.dictionary import Dictionary
from gensim.models import CoherenceModel
#import pyLDAvis
#import pyLDAvis.sklearn
#pyLDAvis.enable_notebook()
# for visualization
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models
# do not print deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.decomposition import LatentDirichletAllocation
stopwords = set(nltk.corpus.stopwords.words('english'))
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
# load all 4 datasets into one list
text_data = []
# nasa dataset
with open('/content/webhose_nasa.json', 'r') as f:
    for line in f.readlines():
        text_data.append(json.loads(line))
# blueorigin dataset
with open('/content/webhose_blueorigin.json', 'r') as f:
    for line in f.readlines():
        text_data.append(json.loads(line))
# virgingalactic dataset
with open('/content/webhose_virgingalactic.json', 'r') as f:
    for line in f.readlines():
        text_data.append(json.loads(line))
# spacex dataset
with open('/content/webhose_spacex.json', 'r') as f:
    for line in f.readlines():
        text_data.append(json.loads(line))
# display total number of articles in the dataset
print("Total number of articles: " + str(len(text_data)))
Total number of articles: 3327
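The four loading cells above repeat the same pattern, so the files can equivalently be read in a single loop; a minimal sketch, assuming the same four file paths:
# equivalent: load all four newline-delimited JSON files in one loop
dataset_files = [
    '/content/webhose_nasa.json',
    '/content/webhose_blueorigin.json',
    '/content/webhose_virgingalactic.json',
    '/content/webhose_spacex.json',
]
text_data = []
for path in dataset_files:
    with open(path, 'r') as f:
        for line in f:
            text_data.append(json.loads(line))
print("Total number of articles: " + str(len(text_data)))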
# display a sample JSON item
text_data[2]
{'thread': {'uuid': '4468adbc05a93375421676846672b73bf368769c', 'url': 'https://www.cvbj.biz/2021/07/20/the-sentinel-6-satellite-sentinel-of-the-oceans-ready-for-launch-this-saturday-21/', 'site_full': 'www.cvbj.biz', 'site': 'cvbj.biz', 'site_section': 'http://cvbj.biz/feed/', 'site_categories': ['media'], 'section_title': 'CVBJ', 'title': 'The Sentinel 6 satellite, sentinel of the oceans, ready for launch this Saturday 21', 'title_full': 'The Sentinel 6 satellite, sentinel of the oceans, ready for launch this Saturday 21', 'published': '2021-07-20T04:08:00.000+03:00', 'replies_count': 0, 'participants_count': 1, 'site_type': 'news', 'country': 'US', 'spam_score': 0.01, 'main_image': '', 'performance_score': 0, 'domain_rank': None, 'reach': None, 'social': {'facebook': {'likes': 0, 'comments': 0, 'shares': 0}, 'gplus': {'shares': 0}, 'pinterest': {'shares': 0}, 'linkedin': {'shares': 0}, 'stumbledupon': {'shares': 0}, 'vk': {'shares': 0}}}, 'uuid': '4468adbc05a93375421676846672b73bf368769c', 'url': 'https://www.cvbj.biz/2021/07/20/the-sentinel-6-satellite-sentinel-of-the-oceans-ready-for-launch-this-saturday-21/', 'ord_in_thread': 0, 'parent_url': None, 'author': 'Cvbj', 'published': '2021-07-20T04:08:00.000+03:00', 'title': 'The Sentinel 6 satellite, sentinel of the oceans, ready for launch this Saturday 21', 'text': 'The Sentinel 6 satellite will be the sentinel of the oceans of the European Earth observation network Copernicus. Christened Michael Freilich in honor of the oceanographer and former director of NASA’s Earth Observation Program, the first of two identical satellites that make up Sentinel 6 will go into orbit this Saturday from Vandenberg Air Force Base, California. It will be launched into orbit by a Falcon 9 rocket from Elon Musk’s trendy space company, Space X. The launch can be followed live on Saturday at 17:45 CET on the dedicated page created by Eumetsat or on the page of the European Space Agency (ESA). In social networks, the label will continue # Sentinel6 This concentrate of technology will make it possible to accurately measure the rise in the level of the oceans which, contrary to what may appear at first glance, is not homogeneous and therefore difficult to measure. One of its missions will be to secure the fourth decade of sea level records, with an accuracy unthinkable 30 years ago, thanks to its Poseidon-4 instrument, an improved radar altimeter that will continue the work of the TOPEX Poseidon and Jason satellites of the POT. It will also provide detailed information on the atmosphere, sea surface, currents and waves that will advance the investigations of oceanographers and climate scientists. Record of international collaboration between agencies The Sentinel 6 satellite duo will map 95% of the non-icy sea surface every 10 days. “Although Sentinel-6 is one of the missions of the Copernicus family of the European Union, its implementation is the result of the unique collaboration between ESA, NASA, Eumetsat and NOAA, with the contribution of the French space agency CNES” says the statement presented today by the European Space Agency (ESA). At first, the two Sentinel 6 satellites will make a 12-month tandem flight, only 30 seconds apart, which will allow scientists to calibrate their results and make them homogeneous with measurements from previous NASA missions. The European Copernicus network makes your data freely accessible to the public. Many applications that we use on a daily basis are based on your system. 
In close collaboration with other organizations such as the European Center for Medium-Term Forecasts or the European Space Agency, Copernicus proposes various services such as Atmospheric Monitoring, Climate Change, an Emergency Management Service or for the Earth. All of them use the data from the 7 satellites that the program already has in orbit. Each set of satellites has its characteristic capabilities. Sentinel 1 uses radar imagery, Sentinel 2 and 3 satellites use optical imagery, and Sentinel 5 satellite has detectors capable of measuring changes in the atmosphere and the concentrations of some greenhouse gases. The Sentinel 6B satellite will launch in 2025 if all goes according to plan. Sentinel 4 will remain in geostationary orbit off Europe and North Africa measuring changes in the atmosphere and its launch is scheduled within 3 years. All Copernicus data is available through platforms such as Sentinel Hub or DIAS platforms, for the more experienced.', 'highlightText': '', 'highlightTitle': '', 'highlightThreadTitle': '', 'language': 'english', 'external_links': [], 'external_images': [], 'entities': {'persons': [{'name': 'copernicus', 'sentiment': 'none'}, {'name': 'michael freilich', 'sentiment': 'none'}], 'organizations': [{'name': 'nasa', 'sentiment': 'none'}, {'name': 'european space agency', 'sentiment': 'none'}, {'name': 'earth observation program', 'sentiment': 'none'}, {'name': 'falcon', 'sentiment': 'none'}], 'locations': [{'name': 'california', 'sentiment': 'none'}, {'name': 'vandenberg air force base', 'sentiment': 'none'}]}, 'rating': None, 'crawled': '2021-07-20T04:20:05.005+03:00', 'updated': '2021-07-20T04:20:05.005+03:00'}
# drop all duplicate articles based on their text
df_feeds = pd.DataFrame(text_data[1:], columns=text_data[0])
df_feeds.drop_duplicates(subset="text", inplace=True)
print("Total number of articles after deduplication: " + str(len(df_feeds)))
Total number of articles after deduplication: 2411
# create a function to remove stopwords and punctuation, then lemmatize
stopwords = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)
def text_cleanup(input_text):
    # drop stopwords from the lowercased text
    one = " ".join([i for i in input_text.lower().split() if i not in stopwords])
    # strip punctuation and any non-alphabetic characters
    two = "".join(re.sub(r'[^a-zA-Z ]', '', i) for i in one if i not in punctuation)
    # lemmatize the remaining tokens
    three = [WordNetLemmatizer().lemmatize(i) for i in two.split()]
    return three
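As a quick sanity check, text_cleanup can be applied to a short made-up sentence (the example string is purely illustrative):
# stopwords, digits and punctuation are removed; remaining tokens are lemmatized
sample = "The Falcon 9 rockets were launched from the pads in California!"
print(text_cleanup(sample))
# expected output (roughly): ['falcon', 'rocket', 'launched', 'pad', 'california']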
# remove stopwords and punctuation, and lemmatize
text_data = df_feeds[['text']].applymap(text_cleanup)['text']
# Create Dictionary
dictionary = Dictionary(text_data)
dictionary.filter_extremes(no_below=10, no_above=0.8)
# Bag-of-words term frequency for each document
corpora = [dictionary.doc2bow(doc) for doc in text_data]
# View
print(corpora[:1])
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 2), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 5), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 1), (49, 2), (50, 3), (51, 1), (52, 4), (53, 2), (54, 1), (55, 1), (56, 1), (57, 2), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 3), (69, 1), (70, 1), (71, 1), (72, 8), (73, 1), (74, 1), (75, 3), (76, 1), (77, 1), (78, 14), (79, 1), (80, 1), (81, 1), (82, 1), (83, 8), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 3), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 3), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1), (120, 2), (121, 2), (122, 2), (123, 1), (124, 2), (125, 1), (126, 3), (127, 7), (128, 1), (129, 1), (130, 1), (131, 5), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 3), (138, 2), (139, 1), (140, 1), (141, 1), (142, 1), (143, 2), (144, 4), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1), (150, 1), (151, 1), (152, 1), (153, 1), (154, 1), (155, 1), (156, 1), (157, 1), (158, 2), (159, 1), (160, 4), (161, 1), (162, 1), (163, 1), (164, 1), (165, 2), (166, 1), (167, 1), (168, 4), (169, 1), (170, 1), (171, 2), (172, 1), (173, 1), (174, 13), (175, 3), (176, 1), (177, 1), (178, 3), (179, 2), (180, 1), (181, 2), (182, 1), (183, 1), (184, 1), (185, 1), (186, 1), (187, 2), (188, 1), (189, 5), (190, 2), (191, 1), (192, 1), (193, 1), (194, 1), (195, 2), (196, 2), (197, 1), (198, 6), (199, 1), (200, 1), (201, 1), (202, 1), (203, 1), (204, 1), (205, 4), (206, 3), (207, 1), (208, 1), (209, 3), (210, 3), (211, 1), (212, 2), (213, 2), (214, 3), (215, 1), (216, 1), (217, 2), (218, 1), (219, 2), (220, 1), (221, 1)]]
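Each pair above is a (token id, frequency) entry in the bag-of-words representation; a small sketch to translate the first few ids of the first document back into words using the Dictionary:
# map the first ten (token_id, frequency) pairs of document 0 back to words
for token_id, freq in corpora[0][:10]:
    print(dictionary[token_id], freq)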
# Train a base model with the minimum number of topics (2)
lda_model = ldamodel.LdaModel(corpora, num_topics=2, id2word = dictionary, passes=50, random_state = 123)
# display top 10 keywords from each cluster
from pprint import pprint
pprint(lda_model.print_topics())
[(0, '0.030*"space" + 0.019*"bezos" + 0.016*"blue" + 0.015*"origin" + ' '0.015*"flight" + 0.011*"new" + 0.010*"company" + 0.010*"rocket" + ' '0.009*"virgin" + 0.008*"launch"'), (1, '0.012*"space" + 0.009*"nasa" + 0.008*"said" + 0.006*"year" + 0.005*"musk" + ' '0.005*"u" + 0.004*"also" + 0.004*"spacex" + 0.004*"one" + 0.004*"time"')]
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model = lda_model, texts = text_data, dictionary = dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Coherence Score: 0.47299034340904567
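Besides the aggregate score, gensim's CoherenceModel can also report a score per topic, which helps spot weak clusters; a short sketch using the objects defined above:
# per-topic c_v coherence for the 2-topic base model
per_topic = coherence_model_lda.get_coherence_per_topic()
for topic_id, score in enumerate(per_topic):
    print("Topic " + str(topic_id) + ": coherence = " + str(round(score, 4)))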
# create a function to calculate the coherence score for a range of topic numbers
def get_coherence_scores(dictionary, corpus, texts, limit, start=2):
    """
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : text data to be trained on
    start : minimum number of topics, defaults to 2
    limit : maximum number of topics (exclusive)
    returns coherence_scores : a list of coherence scores, one per number of topics
    """
    coherence_scores = []
    for num_topics in range(start, limit):
        model = ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=50, random_state=123)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(coherencemodel.get_coherence())
    return coherence_scores
# Tune the LDA model over topic numbers from 2 to 20
coherence_scores = get_coherence_scores(dictionary, corpora, text_data, limit = 21)
coherence_scores
[0.47299034340904567, 0.4446146275870011, 0.4407799676290295, 0.45401070060512005, 0.4786130992721418, 0.4787815684122995, 0.4479512349300888, 0.46243568556055564, 0.46387567353142056, 0.47761917748143606, 0.44352250878287375, 0.4860053766978051, 0.46407496151826055, 0.4979321165596749, 0.46591274717441233, 0.46488375795580994, 0.4563430098641939, 0.46268268118547096, 0.45482538985038634]
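Since coherence_scores[0] corresponds to 2 topics, the topic count with the highest raw coherence can be read off with argmax; a minimal sketch (note that the sections below follow the elbow of the curve and use 6 topics rather than the raw maximum):
# topic count with the highest coherence score (scores start at 2 topics)
best_num_topics = 2 + int(np.argmax(coherence_scores))
print("Highest coherence: " + str(max(coherence_scores)) + " at " + str(best_num_topics) + " topics")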
# Show graph
limit=21; start=2;
x = range(start, limit)
plt.plot(x, coherence_scores)
plt.xticks(range(2, 21))
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(["coherence_values"], loc='best')
plt.show()
# create a function to calculate the coherence score for different numbers of passes
# parameter values for passes
passes = [10, 50, 100, 200, 250]
def get_coherence_scores_passes(dictionary, corpus, texts, num_topics, passes):
    """
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : text data to be trained on
    num_topics : number of topics
    passes : list of pass counts to try (each pass is one sweep over the whole corpus)
    returns coherence_scores_passes : a list of coherence scores, one per pass count
    """
    coherence_scores_passes = []
    for num_passes in passes:
        model = ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=num_passes, random_state=123)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores_passes.append(coherencemodel.get_coherence())
    return coherence_scores_passes
# Tune for the optimal number of passes
coherence_scores_passes = get_coherence_scores_passes(dictionary, corpora, text_data, num_topics = 6, passes = passes)
coherence_scores_passes
[0.4759996199658567, 0.4786130992721418, 0.4818068713516772, 0.48636286722022026, 0.4854663959781143]
# Show graph
plt.plot(passes, coherence_scores_passes)
plt.xlabel("Passes")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()
# train the best model with parameters that produce the highest coherence score
best_lda_model = ldamodel.LdaModel(corpora, num_topics=6, id2word = dictionary, passes=200, random_state = 123)
# visualize the topics learned by the best model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(best_lda_model, corpora, dictionary)
vis
The elbow method yields an improved coherence score; however, the visualization shows overlapping keywords across several clusters. To obtain more clearly separated topic clusters for this dataset, we retrain the model with 4 topics to reduce the overlap.
# train a refined model with 4 topics to reduce cluster overlap
best_lda_model_2 = ldamodel.LdaModel(corpora, num_topics=4, id2word = dictionary, passes=200, random_state = 123)
# visualize the topics learned by the refined model
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(best_lda_model_2, corpora, dictionary)
vis
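To see which of the four topics dominates each article, the refined model can be applied back to the bag-of-words corpus; a minimal sketch using best_lda_model_2 and df_feeds from above (the column name dominant_topic is only illustrative):
# assign the highest-probability topic to every article
dominant_topics = []
for bow in corpora:
    topic_probs = best_lda_model_2.get_document_topics(bow)
    dominant_topics.append(max(topic_probs, key=lambda pair: pair[1])[0])
df_feeds['dominant_topic'] = dominant_topics
# distribution of articles across the four topics
print(df_feeds['dominant_topic'].value_counts())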