10/28/2021

Chat bot

# Define a function that responds to a user's message: respond
def respond(message):
    # Concatenate the user's message to the end of a standard bot response
    bot_message = "I can hear you! You said: " + message
    # Return the result
    return bot_message

# Test function
print(respond("hello!"))

# Create templates
bot_template = "BOT : {0}"
user_template = "USER : {0}"

# Define a function that sends a message to the bot: send_message
def send_message(message):
    # Print user_template including the user_message
    print(user_template.format(message))
    # Get the bot's response to the message
    response = respond(message)
    # Print the bot template including the bot's response
    print(bot_template.format(response))

# Send a message to the bot
send_message("hello")

# Define variables
name = "Greg"
weather = "cloudy"

# Define a dictionary with the predefined responses
responses = {
    "what's your name?": "my name is {0}".format(name),
    "what's today's weather?": "the weather is {0}".format(weather),
    "default": "default message"
}

# Return the matching response if there is one, default otherwise
def respond(message):
    # Check if the message is in the responses
    if message in responses:
        # Return the matching message
        bot_message = responses[message]
    else:
        # Return the "default" message
        bot_message = responses["default"]
    return bot_message
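Quick check of the lookup behaviour (the expected replies follow from the dictionary above):

print(respond("what's your name?"))   # my name is Greg
print(respond("where do you live?"))  # default message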

# Import the random module
import random

name = "Greg"
weather = "cloudy"

# Define a dictionary containing a list of responses for each message
responses = {
    "what's your name?": [
        "my name is {0}".format(name),
        "they call me {0}".format(name),
        "I go by {0}".format(name)
    ],
    "what's today's weather?": [
        "the weather is {0}".format(weather),
        "it's {0} today".format(weather)
    ],
    "default": ["default message"]
}

# Use random.choice() to choose a matching response
def respond(message):
    if message in responses:
        bot_message = random.choice(responses[message])
    else:
        bot_message = random.choice(responses["default"])
    return bot_message


# Use the ending punctuation to pick between question and statement responses
def respond(message):
    # Check for a question mark
    if message.endswith("?"):
        # Return a random question
        return random.choice(responses["question"])
    # Return a random statement
    return random.choice(responses["statement"])
# Define match_rule()
import re

def match_rule(rules, message):
    response, phrase = "default", None
    # Iterate over the rules dictionary
    for pattern, responses in rules.items():
        # Create a match object
        match = re.search(pattern, message)
        if match is not None:
            # Choose a random response
            response = random.choice(responses)
            # Save the matched phrase if the response needs it
            if '{0}' in response:
                phrase = match.group(1)
    # Return the response and phrase
    return response, phrase
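match_rule() relies on a rules dict that the exercise provides: regex patterns containing a capture group, each mapped to a list of response templates. A hedged sketch of its shape (the entries here are illustrative, not the course's exact data):

# Illustrative shape only -- the course supplies the actual rules
rules = {
    "do you remember (.*)": ["Did you think I would forget {0}?",
                             "Why haven't you been able to forget {0}?"],
    "do you think (.*)": ["if {0}? Absolutely.", "No chance"],
    "if (.*)": ["Do you really think it's likely that {0}?", "Do you wish that {0}?"],
    "I want (.*)": ["What would it mean if you got {0}?", "Why do you want {0}?"]
}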

# Test match_rule
print(match_rule(rules, "do you remember your last birthday"))

# Define replace_pronouns()
def replace_pronouns(message):
    message = message.lower()
    if 'me' in message:
        # Replace 'me' with 'you'
        return re.sub('me', 'you', message)
    if 'my' in message:
        # Replace 'my' with 'your'
        return re.sub('my', 'your', message)
    if 'your' in message:
        # Replace 'your' with 'my'
        return re.sub('your', 'my', message)
    if 'you' in message:
        # Replace 'you' with 'me'
        return re.sub('you', 'me', message)

    return message

print(replace_pronouns("my last birthday"))
print(replace_pronouns("when you went to Florida"))
print(replace_pronouns("I had my own castle"))

# Define respond()
def respond(message):
    # Call match_rule
    response, phrase = match_rule(rules, message)
    if '{0}' in response:
        # Replace the pronouns in the phrase
        phrase = replace_pronouns(phrase)
        # Include the phrase in the response
        response = response.format(phrase)
    return response

# Send the messages
send_message("do you remember your last birthday")
send_message("do you think humans should be worried about AI")
send_message("I want a robot friend")
send_message("what if you could be anything you wanted")

# Define a dictionary of patterns
patterns = {}

# Iterate over the keywords dictionary
for intent, keys in keywords.items():
    # Create regular expressions and compile them into pattern objects
    patterns[intent] = re.compile('|'.join(keys))

# Print the patterns
print(patterns)
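The loop above assumes a keywords dict (intent name mapped to a list of trigger words), and the respond() below assumes a matching responses dict; both are pre-loaded in the exercise. A minimal sketch, with illustrative values only:

# Illustrative only -- the exercise supplies the real dictionaries
keywords = {
    "greet": ["hello", "hi", "hey"],
    "goodbye": ["bye", "farewell"],
    "thankyou": ["thank", "thx"]
}
responses = {
    "greet": "Hello you! :)",
    "goodbye": "goodbye for now",
    "thankyou": "you are very welcome",
    "default": "default message"
}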
 
# Define a function to find the intent of a message
def match_intent(message):
    matched_intent = None
    for intent, pattern in patterns.items():
        # Check if the pattern occurs in the message
        if pattern.search(message):
            matched_intent = intent
    return matched_intent

# Define a respond function
def respond(message):
    # Call the match_intent function
    intent = match_intent(message)
    # Fall back to the default response
    key = "default"
    if intent in responses:
        key = intent
    return responses[key]

# Send messages
send_message("hello!")
send_message("bye byeee")
send_message("thanks very much!")
 
# Define find_name()
def find_name(message):
    name = None
    # Create a pattern for checking if the keywords occur
    name_keyword = re.compile('name|call')
    # Create a pattern for finding capitalized words
    name_pattern = re.compile('[A-Z]{1}[a-z]*')
    if name_keyword.search(message):
        # Get the matching words in the string
        name_words = name_pattern.findall(message)
        if len(name_words) > 0:
            # Return the name if the keywords are present
            name = ' '.join(name_words)
    return name

# Define respond()
def respond(message):
    # Find the name
    name = find_name(message)
    if name is None:
        return "Hi there!"
    else:
        return "Hello, {0}!".format(name)

# Send messages
send_message("my name is David Copperfield")
send_message("call me Ishmael")
send_message("People call me Cassandra")
 
# Load the spacy model: nlp
import spacy
import numpy as np

nlp = spacy.load('en')

# Calculate the length of sentences
n_sentences = len(sentences)

# Calculate the dimensionality of nlp
embedding_dim = nlp.vocab.vectors_length

# Initialize the array with zeros: X
X = np.zeros((n_sentences, embedding_dim))

# Iterate over the sentences
for idx, sentence in enumerate(sentences):
    # Pass each sentence to the nlp object to create a document
    doc = nlp(sentence)
    # Save the document's .vector attribute to the corresponding row in X
    X[idx, :] = doc.vector
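The next block assumes X_train, X_test, y_train, y_test already exist (they come pre-split in the course). If recreating this locally, a split could be made from X and a hypothetical labels array:

# Hypothetical setup -- 'labels' would hold one intent label per sentence
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25, random_state=0)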
 
# Import SVC
from sklearn.svm import SVC

# Create a support vector classifier
clf = SVC(C=1)

# Fit the classifier using the training data
clf.fit(X_train, y_train)

# Predict the labels of the test set
y_pred = clf.predict(X_test)

# Count the number of correct predictions
n_correct = 0
for i in range(len(y_test)):
    if y_pred[i] == y_test[i]:
        n_correct += 1

print("Predicted {0} correctly out of {1} test examples".format(n_correct, len(y_test)))
 
 
# Define include_entities
include_entities = ['DATE', 'ORG', 'PERSON']

# Define extract_entities()
def extract_entities(message):
    # Create a dict to hold the entities
    ents = dict.fromkeys(include_entities)
    # Create a spacy document
    doc = nlp(message)
    for ent in doc.ents:
        if ent.label_ in include_entities:
            # Save interesting entities
            ents[ent.label_] = ent.text
    return ents

print(extract_entities('friends called Mary who have worked at Google since 2010'))
print(extract_entities('people who graduated from MIT in 1999'))
 
# Create the document
doc = nlp("let's see that jacket in red and some blue jeans")

# Iterate over parents in parse tree until an item entity is found
def find_parent_item(word):
    # Iterate over the word's ancestors
    for parent in word.ancestors:
        # Check for an "item" entity
        if entity_type(parent) == "item":
            return parent.text
    return None

# For all color entities, find their parent item
def assign_colors(doc):
    # Iterate over the document
    for word in doc:
        # Check for "color" entities
        if entity_type(word) == "color":
            # Find the parent
            item = find_parent_item(word)
            print("item: {0} has color : {1}".format(item, word))

# Assign the colors
assign_colors(doc)
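find_parent_item() and assign_colors() call an entity_type() helper that the exercise defines elsewhere. A minimal stand-in, assuming simple word lists for colors and items (both illustrative):

# Illustrative helper -- the exercise provides its own entity_type()
colors = ["red", "blue", "black"]
items = ["jacket", "jeans", "shoes", "handbag"]

def entity_type(word):
    if word.text in colors:
        return "color"
    if word.text in items:
        return "item"
    return None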
 
 

10/20/2021

NLP


# Write a pattern to match sentence endings: sentence_endings
sentence_endings = r"[.?!]"

# Split my_string on sentence endings and print the result
print(re.split(sentence_endings, my_string))

# Find all capitalized words in my_string and print the result
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, my_string))

# Split my_string on spaces and print the result
spaces = r"\s+"
print(re.split(spaces, my_string))

# Find all digits in my_string and print the result
digits = r"\d+"
print(re.findall(digits, my_string))
 
# Find the script notation at the beginning of the fourth sentence and print it
pattern2 = r"[\w\s]+:"
print(re.match(pattern2, sentences[3]))
 
 
# Import necessary modules
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Split scene_one into sentences: sentences
sentences = sent_tokenize(scene_one)

# Use word_tokenize to tokenize the fourth sentence: tokenized_sent
tokenized_sent = word_tokenize(sentences[3])

# Make a set of unique tokens in the entire scene: unique_tokens
unique_tokens = set(word_tokenize(scene_one))

# Print the unique tokens result
print(unique_tokens)
# Search for the first occurrence of "coconuts" in scene_one: match
match = re.search("coconuts", scene_one)

# Print the start and end indexes of match
print(match.start(), match.end())
 
# Write a regular expression to search for anything in square brackets: pattern1
pattern1 = r"\[.*\]"

# Use re.search to find the first text in square brackets
print(re.search(pattern1, scene_one))
 
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer
# Write a pattern that matches both mentions (@) and hashtags
pattern2 = r"([@#]\w+)"
# Use the pattern on the last tweet in the tweets list
mentions_hashtags = regexp_tokenize(tweets[-1], pattern2)
print(mentions_hashtags)
 
# Use the TweetTokenizer to tokenize all tweets into one list
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in tweets]
print(all_tokens)
 
 
# Tokenize and print all words in german_text
all_words = word_tokenize(german_text)
print(all_words)

# Tokenize and print only capital words
capital_words = r"[A-ZÜ]\w+"
print(regexp_tokenize(german_text, capital_words))

# Tokenize and print only emoji
emoji = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
print(regexp_tokenize(german_text, emoji))

# Split the script into lines: lines
lines = holy_grail.split('\n')

# Remove the speaker notation (e.g. "SOLDIER #1:") from each line
pattern = r"[A-Z]{2,}(\s)?(#\d)?([A-Z]{2,})?:"
lines = [re.sub(pattern, '', l) for l in lines]

# Tokenize each line: tokenized_lines
tokenized_lines = [regexp_tokenize(s, r"\w+") for s in lines]

# Make a frequency list of lengths: line_num_words
line_num_words = [len(t_line) for t_line in tokenized_lines]

# Plot a histogram of the line lengths
plt.hist(line_num_words)
plt.show()
 
 
 
# Import Counter
from collections import Counter
# Tokenize the article: tokens
tokens = word_tokenize(article)
# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]
# Create a Counter with the lowercase tokens: bow_simple
bow_simple = Counter(lower_tokens)
# Print the 10 most common tokens
print(bow_simple.most_common(10))
 
Gensim bag-of-words

# Save the fifth document: doc
doc = corpus[4]

# Sort the doc for frequency: bow_doc
bow_doc = sorted(doc, key=lambda w: w[1], reverse=True)

# Print the top 5 words of the document alongside the count
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)
# Create the defaultdict: total_word_count
from collections import defaultdict
import itertools

total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

# Print the top 5 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:5]:
    print(dictionary.get(word_id), word_count)
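The corpus and dictionary objects above come pre-built in the exercise. With gensim they would normally be created from tokenized documents like this (tokenized_docs is an assumed list of token lists, not part of the course code):

# Assumed setup -- tokenized_docs is a list of lowercase token lists
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]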

What is tf-idf?

tf-idf = tf * log(N / df)
tf: term frequency, how often the term appears in the document
df: document frequency, how many documents in the corpus contain the term (N is the total number of documents)

You want to calculate the tf-idf weight for the word "computer", which appears five times in a document containing 100 words. Given a corpus of 200 documents, 20 of which mention "computer", tf-idf is the product of term frequency and inverse document frequency: term frequency is the word's share of all tokens in the document, and inverse document frequency is the logarithm of the total number of documents in the corpus divided by the number of documents containing the term.

tf_idf = 5/100 * log(200/20)
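Worked out in Python (assuming log base 10, which gives the course's answer):

import math
tf = 5 / 100                 # term frequency: 5 occurrences out of 100 tokens
idf = math.log10(200 / 20)   # inverse document frequency: log10(10) = 1.0
tf_idf = tf * idf            # 0.05
print(tf_idf)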
# Create a new TfidfModel using the corpus: tfidf
tfidf = TfidfModel(corpus)
# Calculate the tfidf weights of doc: tfidf_weights
tfidf_weights = tfidf[doc]
# Print the first five weights
print(tfidf_weights[:5])
# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_weights, key=lambda w: w[1], reverse=True)
# Print the top 5 weighted words
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)
 

NER with NLTK

# Tokenize the article into sentences: sentences
sentences = sent_tokenize(article)
# Tokenize each sentence into words: token_sentences
token_sentences = [word_tokenize(sent) for sent in sentences]
# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences]
# Create the named entity chunks: chunked_sentences
chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=True)

# Test for stems of the tree with 'NE' tags
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE":
            print(chunk)

Charting practice

# Create the defaultdict: ner_categories
ner_categories = defaultdict(int)
# Create the nested for loop
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, 'label'):
            ner_categories[chunk.label()] += 1
# Create a list from the dictionary keys for the chart labels: labels
labels = list(ner_categories.keys())
# Create a list of the values: values
values = [ner_categories.get(v) for v in labels]
# Create the pie chart
plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
# Display the chart
plt.show()
 

Comparing NLTK with spaCy NER

# Import spacy
import spacy
# Instantiate the English model: nlp
nlp = spacy.load('en', tagger=False, parser=False, matcher=False)
# Create a new document: doc
doc = nlp(article)
# Print all of the found entities and their labels
for ent in doc.ents:
    print(ent.label_, ent.text)
 
Which extra categories does spaCy use in its named-entity recognition compared to NLTK?
NORP, CARDINAL, MONEY, WORKOFART, LANGUAGE, EVENT
 

French NER with polyglot

# Create a new text object using Polyglot's Text class: txt
from polyglot.text import Text
txt = Text(article)

# Print each of the entities found
for ent in txt.entities:
    print(ent)

# Print the type of ent
print(type(ent))

# Create the list of tuples: entities
entities = [(ent.tag, ' '.join(ent)) for ent in txt.entities]

# Print entities
print(entities)

 

Spanish NER with polyglot

# Initialize the count variable: count
count = 0
# Iterate over all the entities
for ent in txt.entities:
    # Check whether the entity contains 'Márquez' or 'Gabo'
    if 'Márquez' in ent or 'Gabo' in ent:
        # Increment count
        count += 1

# Print count
print(count)
# Calculate the percentage of entities that refer to "Gabo": percentage
percentage = count / len(txt.entities)
print(percentage)

CountVectorizer for text classification

# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Print the head of df
print(df.head())

# Create a series to store the labels: y
y = df.label

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size = 0.33, random_state = 53)

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words = 'english')

# Transform the training data using only the 'text' column values: count_train
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])
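Note: newer scikit-learn releases (1.0+) renamed this accessor; there the same check would be:

# scikit-learn >= 1.0
print(count_vectorizer.get_feature_names_out()[:10])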

 

TfidfVectorizer for text classification

 

# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)

# Transform the training data: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])
 

Inspecting the vectors

# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame( tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))

Training and testing the "fake news" model with CountVectorizer

 

# Import the necessary modules
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels = ['FAKE', 'REAL'])
print(cm)

# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels = ['FAKE', 'REAL'])
print(cm)

Improving your model

# Create the list of alphas: alphas
alphas = np.arange(0,1,0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    score = train_and_predict(alpha)
    print('Score: ', score)
 

Inspecting your model

# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])
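MultinomialNB.coef_ has been removed in newer scikit-learn releases; feature_log_prob_ holds the per-class log probabilities and supports the same kind of inspection (a sketch, one row per class in classes_):

# Newer scikit-learn: coef_ is gone, inspect feature_log_prob_ instead
for label, row in zip(nb_classifier.classes_, nb_classifier.feature_log_prob_):
    feat_with_weights = sorted(zip(row, feature_names))
    print(label, feat_with_weights[-20:])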