-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNLP_I.py
223 lines (161 loc) · 6.4 KB
/
NLP_I.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 16 12:00:45 2022
@author: AJIT
"""
### natural-language-processing
# Sample inputs so the snippets below run from top to bottom: nothing in this
# file assigns `text` or `pattern` before they are first used. Replace them
# with your own data.
text = "Hello World! This is a sample sentence. NLP is fun :) #nlp"
pattern = r"\w+"  # example pattern for regexp_tokenize: runs of word characters
## remove punctuation
from string import punctuation
# str.translate expects a translation table, not a plain string of characters.
# The third argument of str.maketrans lists the characters to delete.
text = text.translate(str.maketrans("", "", punctuation))
## convert words to lower or upper case (pick whichever one you need)
text = text.lower()
text = text.upper()
## tokenize text
# tokenize text into different words
from nltk.tokenize import word_tokenize
words = word_tokenize(text)
# tokenize text into different sentences
# NOTE: punctuation was stripped above, so the sentence boundaries are gone by
# this point; run this on the raw text when you need real sentence splits.
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(text)
# tokenize text into different words - tokenizes emojis, hashtags and other social media elements properly
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
tokenizer.tokenize(text)
# tokenize words and return words that match the regex pattern
from nltk.tokenize import regexp_tokenize
regexp_tokenize(text, pattern)
## plot word frequencies and word lengths
from nltk import FreqDist
from collections import Counter
import seaborn as sns
import requests
# download text
url = "https://www.gutenberg.org/files/11/11-0.txt"
# A timeout keeps the script from hanging forever on a stalled connection.
book = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as the book.
book.raise_for_status()
text = book.text
# plot word frequencies
def plot_word_frequency(words, top_n=10):
    '''
    Draw a bar chart of the most frequent tokens.

    input:
    words : iterable of tokens (e.g. the output of word_tokenize)
    top_n : number of most frequent tokens to show
    output: the object returned by sns.barplot for the chart
    '''
    word_freq = FreqDist(words)  # or word_freq = Counter(words)
    # Rank once and split the (token, count) pairs, rather than calling
    # most_common() separately for the labels and for the bar heights.
    top_items = word_freq.most_common(top_n)
    labels = [token for token, _ in top_items]
    frequencies = [count for _, count in top_items]
    # Data is passed by keyword: current seaborn releases no longer accept
    # x and y as positional arguments.
    plot = sns.barplot(x=labels, y=frequencies)
    return plot


plot_word_frequency(word_tokenize(text))
def plot_word_length(words):
    '''
    Draw a histogram of token lengths.

    input:
    words : iterable of tokens (anything that supports len())
    output: None - the histogram is drawn on the current matplotlib figure
    '''
    # Imported locally because `plt` is not imported anywhere else in this
    # file; matplotlib is installed as a dependency of seaborn.
    import matplotlib.pyplot as plt

    word_lengths = [len(word) for word in words]
    plt.hist(word_lengths)


plot_word_length(word_tokenize(text))
## strip whitespace from words
words = [word.strip() for word in words]
## remove stopwords
from nltk.corpus import stopwords
# Keep the stopwords in a set: membership tests are O(1) instead of a linear
# scan of the list for every token.
stops = set(stopwords.words('english'))
words = [word for word in words if word not in stops]
## spell correction
from spell_corrector import rectify
words = [rectify(word) for word in words]
## stemming and lemmatization
# The three recipes below are alternatives shown one after another; in a real
# pipeline pick one of them rather than chaining all three.
# stemming
# porter stemmer - works only on english words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]
# snowball stemmer - works on english words (better than porter) as well as some foreign language words
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
words = [stemmer.stem(word) for word in words]
# lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]
## bag-of-words model
# pandas is required for the DataFrame conversions below and is not imported
# anywhere else in this file.
import pandas as pd
documents = ["This is document one.",
             "This is document two. A document can contain multiple sentences!",
             "This is the third document. :)"]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
# print sparse matrix
print(bow_model)
# print full matrix
print(bow_model.toarray())
# convert matrix to dataframe (one column per vocabulary term)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
bow_model = pd.DataFrame(bow_model.toarray(), columns=vectorizer.get_feature_names_out())
## tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(documents)
# print sparse matrix
print(tfidf_model)
# print full matrix
print(tfidf_model.toarray())
# convert matrix to dataframe (one column per vocabulary term)
tfidf_model = pd.DataFrame(tfidf_model.toarray(), columns=vectorizer.get_feature_names_out())
## text preprocess function
def clean_document(document, remove_punct=True, sentence_case="lower", remove_stops=True,
                   spell_correction=False, stem=True, min_word_length=0, char_filter=r"[^\w]"):
    '''
    Normalise a raw document into one cleaned, space-separated string.

    input:
    document : string
    remove_punct : whether to delete every character of string.punctuation from the document
    sentence_case : change document to "lower" case, "upper" case, or pass any other
                    value (e.g. "same") to keep the case as provided
    remove_stops : whether to remove english stopwords (matched case-insensitively)
    spell_correction : whether to correct spelling of each word
    stem : whether to stem each word
    min_word_length : only words strictly longer than min_word_length are kept
    char_filter : regex pattern - every match in the re-joined text is replaced by a space
    output: clean document
    '''
    import re

    # remove all punctuations
    if remove_punct:
        from string import punctuation
        # str.translate needs a translation table; the third argument of
        # str.maketrans lists the characters to delete.
        document = document.translate(str.maketrans("", "", punctuation))

    # convert the document to the requested case (any other value keeps it as is)
    if sentence_case == "lower":
        document = document.lower()
    elif sentence_case == "upper":
        document = document.upper()

    # tokenise words
    from nltk.tokenize import word_tokenize
    words = word_tokenize(document)

    # strip whitespace from all words
    words = [word.strip() for word in words]

    # remove stopwords
    if remove_stops:
        from nltk.corpus import stopwords
        stops = set(stopwords.words("english"))
        # Compare in lower case so stopwords are still caught when the
        # document was upper-cased or left in its original case.
        words = [word for word in words if word.lower() not in stops]

    # spell correction
    if spell_correction:
        from spell_corrector import rectify
        words = [rectify(word) for word in words]

    # stemming
    if stem:
        from nltk.stem.snowball import SnowballStemmer
        # NOTE(review): the Snowball stemmer appears to lower-case its input,
        # so sentence_case="upper" may not survive stem=True - confirm.
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(word) for word in words]

    # remove extremely short words (length must exceed min_word_length)
    words = [word for word in words if len(word) > min_word_length]

    # join back words to get document
    document = " ".join(words)

    # replace unwanted characters with a space
    document = re.sub(char_filter, " ", document)

    # replace multiple whitespaces with single whitespace
    document = re.sub(r"\s+", " ", document)

    # strip whitespace from document
    document = document.strip()
    return document
## edit-distance
from nltk.metrics.distance import edit_distance
edit_distance("hello", "hola", substitution_cost=1, transpositions=True)
## named-entity recognition (NER) using spacy
import spacy
# The "en" shortcut only exists in spaCy 2; load the model package by its full
# name instead (install it with: python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")  # load english pipeline
# Look up the entity recognizer by its pipe name; the original `npl.entity`
# was a typo for `nlp` and relied on a spaCy 2-only attribute.
print(nlp.get_pipe("ner"))
doc = nlp("sample text")  # NER
print(doc.ents)  # look at entities
# print entities
for entity in doc.ents:
    print(entity.text, entity.label_)  # look at entities and their labels
# The separator below was a bare line of "=" characters (a SyntaxError); it is
# kept as a comment.
# ==============================================================================================================================