Python Text Tokenizer
import time
from collections import Counter

import pandas as pd
import spacy
from nltk.stem import PorterStemmer
start_time = time.time()

porter = PorterStemmer()
print("Stemmer loaded --- %s seconds ---" % (time.time() - start_time))
nlp = spacy.load('en_core_web_sm')  # takes about 1.18 seconds to load
print("spaCy model loaded --- %s seconds ---" % (time.time() - start_time))
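# A possible speed-up (an assumption, not part of the original timing run):
# the 'ner' component is never used below, so it could be disabled at load
# time. Note that doc.noun_chunks still needs the parser, so keep it enabled.
# nlp = spacy.load('en_core_web_sm', disable=['ner'])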
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)
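# For example, stem_sentences('bushfire warnings') returns something like
# 'bushfir warn' (the exact form depends on the Porter stemmer).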
complete_text = ("NSW bushfire warnings spread to Sydney, blaze close to houses "
                 "at Turramurra. NSW's bushfire chaos has spread to Sydney, where "
                 "an emergency warning has been issued near a university and "
                 "hospital on the city's North Shore. The NSW Rural Fire Service "
                 "(RFS) has issued emergency warnings for a total of 15 blazes "
                 "burning around the state. An emergency level bushfire is burning "
                 "just three kilometres north of Macquarie University in South "
                 "Turramurra.")
doc = nlp(complete_text)

# Keep every token's text, then a filtered list without stop words,
# punctuation symbols and whitespace
all_words = [token.text for token in doc]
words = [token.text for token in doc
         if not token.is_stop and not token.is_punct and not token.is_space]
# Calculate word frequencies
word_freq = Counter(words)
print("Word frequencies counted --- %s seconds ---" % (time.time() - start_time))
# Convert to a DataFrame
dataframe_word_freq = (pd.DataFrame.from_dict(word_freq, orient='index')
                       .reset_index()
                       .rename(columns={'index': 'Term', 0: 'Frequency'}))
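# An equivalent alternative (not from the original code): Counter.most_common()
# yields (term, count) tuples, already sorted by frequency.
# dataframe_word_freq = pd.DataFrame(word_freq.most_common(),
#                                    columns=['Term', 'Frequency'])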
# Extract noun phrases
new_words = [chunk.text for chunk in doc.noun_chunks]

# Optionally drop noun phrases whose text already appears in all_words:
# new_words = list(filter(lambda w: w not in all_words, new_words))
# Count noun-phrase frequencies
new_words_freq = Counter(new_words)

# Convert noun-phrase counts to a DataFrame
dataframe_new_words_freq = (pd.DataFrame.from_dict(new_words_freq, orient='index')
                            .reset_index()
                            .rename(columns={'index': 'Term', 0: 'Frequency'}))
# Remove noun phrases which are already present in word_freq
dataframe_new_words_freq = dataframe_new_words_freq[
    ~dataframe_new_words_freq.Term.isin(dataframe_word_freq.Term)]
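# The ~ negates the isin() boolean mask, so only terms that were not already
# counted as single words survive into the noun-phrase DataFrame.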
# Merge both DataFrames (single words and noun phrases)
df_final = pd.concat([dataframe_word_freq, dataframe_new_words_freq],
                     ignore_index=True)
df_final.sort_values("Frequency", ascending=False, inplace=True)
# Stem each term; multi-word noun phrases are stemmed word by word
df_final['TermStem'] = df_final['Term'].apply(stem_sentences)

print(df_final[:15])
# The whole script takes around 1.3 seconds to run
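# For finer-grained timing than time.time(), time.perf_counter() could be used
# instead (a sketch, not part of the original script):
# t0 = time.perf_counter()
# doc = nlp(complete_text)
# print("nlp() took %.3f seconds" % (time.perf_counter() - t0))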