NLTK Text Processing Examples: Tokenization to NER
Classified in Computers
Written in English with a size of 4.53 KB
NLTK Text Processing Examples
Experiment 4: Basic Text Preprocessing
This section demonstrates fundamental text preprocessing steps using NLTK, including tokenization, stop word removal, filtering for alphabetic tokens, and stemming.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = "Random sampling is a method of choosing a sample of observations from a population to make assumptions about the population"

# Split the raw sentence into word-level tokens.
tokens = word_tokenize(text)

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Step 1: lowercase and keep only purely alphabetic tokens
# (drops numbers and punctuation).
alpha_tokens = [tok.lower() for tok in tokens if tok.isalpha()]

# Step 2: keep only tokens present in NLTK's English word list
# (optional dictionary filter, shown for illustration).
eng_words = set(words.words())
valid_tokens = [tok for tok in alpha_tokens if tok in eng_words]

# Step 3: discard common English stop words.
filtered_tokens = [tok for tok in valid_tokens if tok not in stop_words]

# Step 4: reduce each surviving token to its Porter stem.
stemmed_tokens = [stemmer.stem(tok) for tok in filtered_tokens]

print('Original Text:', text)
print('\nTokenized Text:', tokens)
print('\nAlpha Text:', alpha_tokens)
print('\nFiltered Text (Valid & Non-Stop Words):', filtered_tokens)
print('\nStemmed Tokens:', stemmed_tokens)
Experiment 6: TF-IDF Vectorization
This example shows how to preprocess text data (lowercasing, tokenizing, stop word removal, stemming) and then apply TfidfVectorizer from Scikit-learn.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure necessary NLTK data is downloaded
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

text_data = ['tiger bites deer', 'deer bites grass', 'grass is green']

# Hoisted out of preprocess_text: building the stop-word set and the
# stemmer once avoids repeating that work for every document processed.
_STOP_WORDS = set(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Lowercase, tokenize, remove English stop words, and stem *text*.

    Returns the processed tokens re-joined into a single space-separated
    string, which is the input format TfidfVectorizer expects.
    """
    tokens = nltk.word_tokenize(text.lower())
    tokens = [tok for tok in tokens if tok not in _STOP_WORDS]
    tokens = [_STEMMER.stem(tok) for tok in tokens]
    # Join back into a single string for TF-IDF input
    return ' '.join(tokens)


text_data_preprocessed = [preprocess_text(text) for text in text_data]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(text_data_preprocessed)
features = tfidf.get_feature_names_out()
print('Preprocessed Data:', text_data_preprocessed)
print('\nTF-IDF Features:', features)
print('\nTF-IDF Matrix (Array Form):')
print(tfidf_matrix.toarray())
Experiment 7: Named Entity Recognition (NER)
Demonstration of using NLTK for Part-of-Speech (POS) tagging followed by Named Entity Chunking.
import nltk

# Ensure necessary NLTK data is downloaded.
# punkt is required by word_tokenize below, so download it here too
# rather than relying on an earlier snippet having fetched it.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('maxent_ne_chunker', quiet=True)
nltk.download('words', quiet=True)

text = "John works for Google in New York"

# Tokenize -> POS-tag -> chunk named entities.
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)

print('Text:', text)
print('\nEntities Found:')
for entity in entities:
    # Named-entity chunks come back as Tree nodes (which have .label());
    # non-entity items are plain (word, tag) tuples and are skipped.
    if hasattr(entity, 'label'):
        # BUG FIX: join leaf words with a space so multi-word entities
        # like "New York" print as "New York" rather than "NewYork".
        entity_text = ' '.join(leaf[0] for leaf in entity.leaves())
        print(f'{entity.label()}: {entity_text}')
Experiment 8: Lemmatization and POS Tagging
This code snippet performs tokenization, stop word removal, lemmatization, and then applies POS tagging to the resulting tokens.
Steps:
- Tokenize the input line.
- Filter out common English stop words.
- Apply WordNetLemmatizer.
- Generate final Part-of-Speech tags.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Note: Lemmatization often benefits from POS tags, but this example uses default lemmatization.
line = 'the qick brown fox jump over a lazy dog'

# Tokenize the input sentence.
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))

# Drop English stop words, comparing case-insensitively so capitalized
# stop words are also removed.
filtered_tokens = [tok for tok in tokens if tok.lower() not in stop_words]

# Lemmatize each remaining token (default noun lemmatization).
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(tok) for tok in filtered_tokens]

# Tag each lemmatized token with its part of speech.
pos_tags = nltk.pos_tag(lemmatized_tokens)

print('Original Line:', line)
print('\nLemmatized Tokens:', lemmatized_tokens)
print('\nWord and POS Tag Pairs:')
for word, tag in pos_tags:
    print(f'{word} : {tag}')