Source code for sussex_nltk.util
'''
.. codeauthor::
Matti Lyra
'''
import random
import matplotlib.pyplot as plt
def lexical_diversity(text):
    '''
    Return the ratio of items to distinct items in ``text``.

    The value is ``len(text) / len(set(text))`` computed as a float, i.e.
    the average number of times each distinct item occurs.

    :param text: a sized collection of hashable items (e.g. a list of tokens)
    :returns: the ratio as a float, or ``0.0`` when ``text`` is empty
    '''
    try:
        return len(text) / (len(set(text)) + 0.0)
    except ZeroDivisionError:
        # An empty text has no distinct items; report 0.0 instead of crashing.
        return 0.0
def percentage(count, total):
    '''
    Return ``count`` as a percentage of ``total``.

    The division is always carried out in floating point, so integer
    arguments do not truncate.

    :param count: the part
    :param total: the whole
    :returns: ``100 * count / total`` as a float, or ``0.0`` when ``total``
        is zero
    '''
    try:
        return 100 * count / (total + 0.0)
    except ZeroDivisionError:
        # A zero total has no meaningful percentage; report 0.0 instead of crashing.
        return 0.0
def sample_from_corpus(corpus, sample_size):
    '''
    Return ``sample_size`` randomly chosen sentences from ``corpus``.

    Sentence positions are drawn without replacement from
    ``0 .. corpus.enumerate_sents() - 1`` and the sentences found at those
    positions in ``corpus.sents()`` are returned in corpus order.

    :param corpus: object providing ``enumerate_sents()`` (used here as the
        number of sentences) and ``sents()`` (an iterable over the sentences)
    :param sample_size: number of sentences to draw
    :returns: list of the sampled sentences
    :raises ValueError: propagated from ``random.sample`` when
        ``sample_size`` is negative or larger than the number of sentences
    '''
    n = corpus.enumerate_sents()
    try:
        positions = xrange(n)  # Python 2: lazy range, avoids building a list
    except NameError:
        positions = range(n)  # Python 3: range is already lazy
    sample_indices = set(random.sample(positions, sample_size))
    if not sample_indices:
        return []

    sampled = []
    for i, sent in enumerate(corpus.sents()):
        if i in sample_indices:
            sampled.append(sent)
            # Every chosen position has been seen; no need to read the
            # remainder of the corpus.
            if len(sampled) == len(sample_indices):
                break
    return sampled
def zipf_dist(freqdist, num_of_ranks=50, show_values=True):
    '''
    Given a frequency distribution object, rank all types
    in order of frequency of occurrence (where rank 1 is most
    frequent word), and plot the ranks against the frequency
    of occurrence. If num_of_ranks=20, then at most 20 types will
    be plotted (fewer if the distribution has fewer types).
    If show_values = True, then display the bar values above them.

    :param freqdist: mapping from type to frequency; only ``values()`` is used
    :param num_of_ranks: maximum number of top-ranked types to plot
    :param show_values: whether to annotate each bar with its frequency
    '''
    # Sort explicitly instead of relying on the order of values(): a plain
    # dict or Counter is not frequency-ordered, and Python 3 dict views
    # cannot be sliced directly.
    y = sorted(freqdist.values(), reverse=True)[:num_of_ranks]  # frequencies of the ranked types
    # Build ranks from the number of values actually available so that x and
    # y always have the same length, even with fewer types than num_of_ranks.
    x = list(range(1, len(y) + 1))  # x values are the ranks of types

    plt.bar(x, y, color="#1AADA4")  # plot a bar graph of x and y
    plt.xlabel("Rank of types ordered by frequency of occurrence")
    plt.ylabel("Frequency of occurrence")  # set the label of the y axis
    plt.grid(True)  # display grid on graph

    ticks = list(range(1, num_of_ranks + 1, 2))
    plt.xticks(ticks, ticks)  # label every second rank on the x axis
    plt.xlim([0, num_of_ranks + 2])  # limit the display on the x axis

    if show_values:  # show the y values on the bars
        for xi, yi in zip(x, y):
            plt.text(xi + 0.25, yi + 50, yi, verticalalignment="bottom",
                     rotation=55, fontsize="small")

    plt.show()  # display the graph
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Plot complete.")