
Page 1: Basic NLP with Python and NLTK

Basic NLP with Python and NLTK

Bruni Francesco (@brunifrancesco)

Download the original IPython notebook @ https://github.com/brunifrancesco/nltk_base.git

Page 2: Basic NLP with Python and NLTK

Python

- Programming language
- Multi-paradigm
- Easy to learn
- Suitable for multiple needs
- Multiple implementations, a ton of useful libraries

Page 3: Basic NLP with Python and NLTK

Basic Python

import random

a_number = 1
a_string = "Python rocks!"
a_list = ["1", "2", "3"]
a_dict = {"film": "Pulp fiction", "francesco": "Python"}
print(a_dict.values())

a_dict_of_list = {"key": ["Carlito's way", "The godfather"], "francesco": 1}
print(len(a_dict_of_list["key"]))

a_tuple = ("Goodfellas", "Kill Bill",)
a_list.append(4)

Page 4: Basic NLP with Python and NLTK

Creating functions

def super_function(number):
    return number * 2

def factorial(n):
    if n == 0:
        return 1
    else:
        return n * factorial(n - 1)

double = lambda item: item * 2
predicate = lambda item: item > 3

assert super_function(3) == 6
assert factorial(3) == 6
assert double(3) == 6
assert list(filter(predicate, [1, 2, 5, 3])) == [5]

Page 5: Basic NLP with Python and NLTK

And much more

- Object-oriented paradigm --> classes, metaclasses, etc.
- Functional programming paradigm --> partials, closures, higher-order functions, etc.
- Scripting paradigm --> shell control, OS-related functions, etc.
- Async ops support --> asyncio
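
A minimal sketch of the functional-style pieces named above (closures, partials, higher-order functions); the names used here are illustrative, not from the original slides:

from functools import partial

def multiplier(factor):
    # Closure: the inner function captures `factor`
    def multiply(value):
        return value * factor
    return multiply

double = multiplier(2)
assert double(3) == 6

# Partial application: fix the first argument of pow()
power_of_two = partial(pow, 2)
assert power_of_two(10) == 1024

# Higher-order function: map() takes a function as input
assert list(map(double, [1, 2, 3])) == [2, 4, 6]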

Page 6: Basic NLP with Python and NLTK

Reading files

with open("file", "r") as input_file:
    data = input_file.read()

import csv

def read_csv():
    with open('data.csv', 'r') as francesco:
        data = csv.reader(francesco, delimiter=';')
        for element in data:
            print(element[1])

read_csv()

Page 7: Basic NLP with Python and NLTK

Make data talk

from collections import Counter
import statistics

splitted_chunks = data.split()
print("Data length: %s" % len(data))
print("Number of chunks: %s" % len(splitted_chunks))
print("Unique chunks: %s" % len(set(splitted_chunks)))
print("Avg length of chunks: %s" % statistics.mean(map(len, splitted_chunks)))
print("Std dev length of chunks: %s" % statistics.pstdev(map(len, splitted_chunks)))
print("Frequency distribution: %s" %
      sorted(filter(lambda item: item[1] > 5, Counter(splitted_chunks).items()),
             key=lambda item: item[1]))

Page 8: Basic NLP with Python and NLTK

NLTK

- tokenization
- stemming
- tagging
- parsing
- semantic reasoning
- classification
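
Tagging, for example, is a two-liner; a quick sketch assuming the `punkt` and `averaged_perceptron_tagger` resources have already been downloaded (the sentence is made up):

import nltk
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("NLTK makes tagging easy")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]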

Page 9: Basic NLP with Python and NLTK

Tokenizing

from nltk import word_tokenize
tokens = word_tokenize(data)

from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tw_tokens = tokenizer.tokenize(s1)
print(tw_tokens)

Page 10: Basic NLP with Python and NLTK

Frequency distribution

from nltk import FreqDist

fdist1 = FreqDist(splitted_chunks)
most_common = fdist1.most_common(50)
fdist1.plot(50, cumulative=True)
fdist1.plot(10)
print("Max frequency key: %s" % fdist1.max())
print("Occurrences of 'Parlamento': %s" % fdist1["Parlamento"])
print("Frequency of 'Parlamento': %s" % fdist1.freq('Parlamento'))

Page 11: Basic NLP with Python and NLTK

Cleaning data

import string
from nltk.corpus import stopwords

words = stopwords.words('italian')

def remove_stopword(word):
    return word not in words

lowered_chunks = list(map(lambda item: item.lower(), splitted_chunks))
print("Chunks length: %s" % len(lowered_chunks))

# Filter the lowercased chunks: the stopword list is lowercase
clean_chunks = list(filter(remove_stopword, lowered_chunks))
print("Cleaned chunks (without stopwords) length: %s" % len(clean_chunks))

clean_chunks = list(filter(lambda chunk: chunk not in string.punctuation, clean_chunks))
print("Cleaned chunks (without punctuation and stopwords) length: %s" % len(clean_chunks))

from nltk import FreqDist
fdist1 = FreqDist(clean_chunks)
most_common = fdist1.most_common(50)

Page 12: Basic NLP with Python and NLTK

Stemming

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

stemmer = PorterStemmer()
stemmer.stem("activities")

available_langs = SnowballStemmer.languages
sn_stemmer = SnowballStemmer("spanish", ignore_stopwords=True)
print(sn_stemmer.stem("ordenador"))

LancasterStemmer().stem("activities")

Page 13: Basic NLP with Python and NLTK

Custom ngrams finder

from nltk import RegexpParser

def find_and_analyze_ngrams(self, tagged_sent):
    # Method of a larger class: CHUNK_RULE, self.__leaves,
    # self.__stemmer and self.__evaluate_polarity_ngram are
    # defined elsewhere in that class.
    chunker = RegexpParser(CHUNK_RULE)
    tree = chunker.parse(tagged_sent)
    for item in self.__leaves(tree):
        if not item == tagged_sent:
            probable_ngram = ' '.join(self.__stemmer.stem(word.lower())
                                      for (word, pos) in item)
            if self.__evaluate_polarity_ngram(probable_ngram):
                yield probable_ngram
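
Since CHUNK_RULE and the self.__* helpers live elsewhere in the author's class, here is a self-contained sketch of the same chunking idea; the grammar and sentence are hypothetical, not the author's actual CHUNK_RULE:

from nltk import RegexpParser, pos_tag, word_tokenize

# Hypothetical chunk rule: a noun phrase as an optional determiner,
# any number of adjectives, then a noun
CHUNK_RULE = "NP: {<DT>?<JJ>*<NN>}"

chunker = RegexpParser(CHUNK_RULE)
tagged_sent = pos_tag(word_tokenize("the quick brown fox jumps"))
tree = chunker.parse(tagged_sent)

# The leaves of each NP subtree are candidate ngrams
for subtree in tree.subtrees(filter=lambda t: t.label() == "NP"):
    print(' '.join(word for word, pos in subtree.leaves()))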

Page 14: Basic NLP with Python and NLTK

Classifying data

from nltk.classify import NaiveBayesClassifier

def __get_elements_for_classification(self, lfeats, train_number, classifying=True):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():  # iteritems() is Python 2 only
        if classifying:
            train_feats.extend([(feat, label) for feat in feats])
        else:
            # Integer cutoff for the train/test split
            cutoff = int(train_number * len(feats) / 10)
            train_feats.extend([(feat, label) for feat in feats[:cutoff]])
            test_feats.extend([(feat, label) for feat in feats[cutoff:]])

    nb_classifier = NaiveBayesClassifier.train(train_feats)
    return train_feats, test_feats, nb_classifier
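
A sketch of how the method above might be driven, assuming lfeats maps each label to a list of bag-of-words feature dicts; the labels and sentences here are made up:

from nltk.classify import NaiveBayesClassifier

def bag_of_words(words):
    # The simplest NLTK feature format: word -> True
    return {word: True for word in words}

lfeats = {
    'pos': [bag_of_words("great movie".split()),
            bag_of_words("really enjoyable film".split())],
    'neg': [bag_of_words("boring plot".split()),
            bag_of_words("truly awful film".split())],
}

train_feats = [(feat, label) for label, feats in lfeats.items() for feat in feats]
nb_classifier = NaiveBayesClassifier.train(train_feats)
print(nb_classifier.classify(bag_of_words("awful movie".split())))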

Page 15: Basic NLP with Python and NLTK

Pointwise Mutual Information

PMI(X = x, Y = y) = log( p(X = x, Y = y) / ( p(X = x) * p(Y = y) ) )
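
For instance, with made-up probabilities p(X = x, Y = y) = 0.1, p(X = x) = 0.25 and p(Y = y) = 0.2:

PMI = log2( 0.1 / (0.25 * 0.2) ) = log2(2) = 1

so the pair co-occurs twice as often as independence would predict.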

Page 16: Basic NLP with Python and NLTK

Measure PMI

- Read from CSV
- Preprocess data (tokenize, lower, remove stopwords, punctuation)
- Find frequency distribution for unigrams
- Find frequency distribution for bigrams
- Compute PMI via implemented function
- Let NLTK sort bigrams by PMI metric
- Write result to CSV file
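
The functions defined on the following slides can be glued together roughly as below; the CSV column index row[1] and the use of BigramAssocMeasures.pmi for the NLTK-side sorting are assumptions about the author's setup, not taken from the slides:

import nltk

bigram_measures = nltk.collocations.BigramAssocMeasures()

text = ' '.join(row[1] for row in read_data())  # column index assumed
chunks = preprocess(text)
unigrams = find_unigrams(chunks)
bigrams = find_bigrams(chunks)

# PMI via the implemented function; guard against words that
# were dropped by the unigram frequency filter
scores = [(w1, w2, pmi(w1, w2, unigrams, bigrams))
          for (w1, w2) in bigrams
          if w1 in unigrams and w2 in unigrams]

# Let NLTK sort bigrams by its own PMI metric
finder = nltk.collocations.BigramCollocationFinder.from_words(chunks)
ranked = finder.score_ngrams(bigram_measures.pmi)

write_data(scores)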

Page 17: Basic NLP with Python and NLTK

Read data

import nltk
from nltk.corpus import stopwords
import string
import random
from itertools import chain
import math
import csv
import time

def read_data():
    """Read data line by line"""
    with open('data.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            yield row

Page 18: Basic NLP with Python and NLTK

Preprocess

def preprocess(data):
    """
    Preprocess data, filtering out stopwords and punctuation
    and lowercasing all split tokens

    :param data: the string data to be processed
    """
    italian_stopwords = stopwords.words('italian')
    splitted_chunks = data.split()
    lowered_chunks = (item.lower() for item in splitted_chunks)
    chunks_without_punctuation = (chunk for chunk in lowered_chunks
                                  if chunk not in string.punctuation)
    chunks_without_stopwords = (chunk for chunk in chunks_without_punctuation
                                if chunk not in italian_stopwords)
    return list(chunks_without_stopwords)

Page 19: Basic NLP with Python and NLTK

Find N-Grams

FREQUENCY_THRESHOLD = 2

def find_bigrams(splitted_chunks):
    """
    Find bigrams and filter them by frequency threshold

    :param splitted_chunks: a list of chunks
    """
    bigrams = nltk.collocations.BigramCollocationFinder.from_words(splitted_chunks)
    bigrams.apply_freq_filter(FREQUENCY_THRESHOLD)
    return {bigram: freq for bigram, freq in bigrams.ngram_fd.items()}

def find_unigrams(splitted_chunks):
    """
    Find unigrams and filter them by frequency threshold

    :param splitted_chunks: a list of chunks
    """
    unigrams = nltk.FreqDist(splitted_chunks)
    return {unigram: freq for unigram, freq in unigrams.items()
            if freq > FREQUENCY_THRESHOLD - 1}

Page 20: Basic NLP with Python and NLTK

Compute PMI

def pmi(word1, word2, unigram_freq, bigram_freq):
    """
    Compute the PMI measure

    :param word1: the first word
    :param word2: the second word
    :param unigram_freq: the unigram frequency container
    :param bigram_freq: the bigram frequency container
    """
    prob_word1 = unigram_freq[word1] / sum(unigram_freq.values())
    prob_word2 = unigram_freq[word2] / sum(unigram_freq.values())
    prob_word1_word2 = bigram_freq[(word1, word2)] / sum(bigram_freq.values())
    # Parenthesize the denominator: PMI divides by the *product*
    # p(word1) * p(word2), not by p(word1) alone
    ratio = prob_word1_word2 / (prob_word1 * prob_word2)
    return round(math.log(ratio, 2), 2)

Page 21: Basic NLP with Python and NLTK

Write result to CSV

def write_data(result):
    """
    Write result to a CSV file

    :param result: the list to be written to the CSV file
    """
    with open("result.csv", "a") as output:
        writer = csv.writer(output, delimiter='*')
        for row in result:
            writer.writerow(row)

Page 22: Basic NLP with Python and NLTK

Happy coding :)