Python Text Tokenizer
import time
from collections import Counter

import pandas as pd
import spacy
from nltk.stem import PorterStemmer
start_time = time.time()

porter = PorterStemmer()
print("Stemmer loaded --- %s seconds ---" % (time.time() - start_time))
nlp = spacy.load('en_core_web_sm')  # takes about 1.18 seconds to load
print("spaCy model loaded --- %s seconds ---" % (time.time() - start_time))
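# A possible speed-up (an assumption, not part of the original timing run):
# the 'ner' component is never used below, so it could be disabled at load
# time. Note that doc.noun_chunks still needs the parser, so keep it enabled.
# nlp = spacy.load('en_core_web_sm', disable=['ner'])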
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)
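# For example, stem_sentences('bushfire warnings') returns something like
# 'bushfir warn' (the exact form depends on the Porter stemmer).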
complete_text = ("NSW bushfire warnings spread to Sydney, blaze close to houses "
                 "at Turramurra. NSW's bushfire chaos has spread to Sydney, where "
                 "an emergency warning has been issued near a university and "
                 "hospital on the city's North Shore. The NSW Rural Fire Service "
                 "(RFS) has issued emergency warnings for a total of 15 blazes "
                 "burning around the state. An emergency level bushfire is burning "
                 "just three kilometres north of Macquarie University in South "
                 "Turramurra.")
doc = nlp(complete_text)

# Keep every token's text, then a filtered list without stop words,
# punctuation symbols and whitespace
all_words = [token.text for token in doc]
words = [token.text for token in doc
         if not token.is_stop and not token.is_punct and not token.is_space]
# Calculate word frequencies
word_freq = Counter(words)
print("Word frequencies counted --- %s seconds ---" % (time.time() - start_time))
# Convert to a DataFrame
dataframe_word_freq = (pd.DataFrame.from_dict(word_freq, orient='index')
                       .reset_index()
                       .rename(columns={'index': 'Term', 0: 'Frequency'}))
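# An equivalent alternative (not from the original code): Counter.most_common()
# yields (term, count) tuples, already sorted by frequency.
# dataframe_word_freq = pd.DataFrame(word_freq.most_common(),
#                                    columns=['Term', 'Frequency'])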
# Extract noun phrases
new_words = [chunk.text for chunk in doc.noun_chunks]

# Optionally drop noun phrases whose text already appears in all_words:
# new_words = list(filter(lambda w: w not in all_words, new_words))
# Count noun-phrase frequencies
new_words_freq = Counter(new_words)

# Convert noun-phrase counts to a DataFrame
dataframe_new_words_freq = (pd.DataFrame.from_dict(new_words_freq, orient='index')
                            .reset_index()
                            .rename(columns={'index': 'Term', 0: 'Frequency'}))
# Remove noun phrases which are already present in word_freq
dataframe_new_words_freq = dataframe_new_words_freq[
    ~dataframe_new_words_freq.Term.isin(dataframe_word_freq.Term)]
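# The ~ negates the isin() boolean mask, so only terms that were not already
# counted as single words survive into the noun-phrase DataFrame.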
# Merge both DataFrames (single words and noun phrases)
df_final = pd.concat([dataframe_word_freq, dataframe_new_words_freq],
                     ignore_index=True)
df_final.sort_values("Frequency", ascending=False, inplace=True)
# Stem each term; multi-word noun phrases are stemmed word by word
df_final['TermStem'] = df_final['Term'].apply(stem_sentences)

print(df_final[:15])
# The whole script takes around 1.3 seconds to run
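# For finer-grained timing than time.time(), time.perf_counter() could be used
# instead (a sketch, not part of the original script):
# t0 = time.perf_counter()
# doc = nlp(complete_text)
# print("nlp() took %.3f seconds" % (time.perf_counter() - t0))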