Source code for sussex_nltk.stats

'''
.. codeauthor::
    Matti Lyra
'''

import os
import random
import matplotlib.pyplot as plt
from numpy import average
try:
    import cPickle as pickle
except ImportError:
    import pickle

import nltk

import sussex_nltk


fh = open(os.path.join(sussex_nltk._sussex_root, 'data', 'sentiword', 'sentiword.pickle'), 'rb')
_sent_words = pickle.load(fh)
fh.close()
del fh

def expected_token_freq(tokens, word, _n_norm=5000):
    """Calculates the expected frequency of *word* per *_n_norm* tokens.
    """
    stats = []
    for head, tail in zip(range(0, len(tokens), _n_norm),
                          range(_n_norm, len(tokens) + _n_norm, _n_norm)):
        chunk = tokens[head:tail]
        stats.append(chunk.count(word))
    return average(stats)
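
# Usage sketch for expected_token_freq (the filename below is purely
# illustrative; any list of token strings works):
#
#     >>> tokens = nltk.word_tokenize(open('review.txt').read())
#     >>> expected_token_freq(tokens, 'excellent', _n_norm=1000)
#
# i.e. the mean number of times 'excellent' occurs per 1000-token chunk.
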
def expected_sentiment_tokens(tokens, _n_norm=500):
    """Calculates the expected number of sentiment bearing tokens per
    *_n_norm* tokens.
    """
    if len(tokens) < _n_norm:
        raise ValueError('Not enough data to calculate statistic, '
                         'tokens must be longer than %i items.' % _n_norm)
    keys = _sent_words.keys()
    keys = [k.replace('_', ' ') for k in keys]
    _stats = []
    for head, tail in zip(range(0, len(tokens), _n_norm),
                          range(_n_norm, len(tokens) + _n_norm, _n_norm)):
        chunk = tokens[head:tail]
        vocab = set(chunk)
        intersection = vocab.intersection(keys)
        fd = nltk.probability.FreqDist(chunk)
        _stats.append(sum([fd[w] for w in intersection]))
    return sum(_stats) / (len(_stats) + 0.0)
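
# Usage sketch (reusing the `tokens` list from above; a ValueError is raised
# if it holds fewer than _n_norm items):
#
#     >>> expected_sentiment_tokens(tokens, _n_norm=500)
#
# i.e. the mean count per 500-token chunk of tokens that also appear in the
# sentiword list loaded at module import.
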
def prob_short_sents(sents):
    """Calculates the probability of a sentence of 2 or fewer tokens.
    """
    return len([sent for sent in sents if len(sent) < 3]) / float(len(sents))
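
# Usage sketch (a hypothetical list of tokenised sentences):
#
#     >>> sents = [['Hi', '.'], ['This', 'is', 'a', 'longer', 'sentence', '.']]
#     >>> prob_short_sents(sents)
#     0.5
#
# One of the two sentences has fewer than 3 tokens.
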
def normalised_lexical_diversity(tokens, _n_norm=500):
    """Calculates the average lexical diversity per *_n_norm* tokens.
    """
    if len(tokens) < _n_norm:
        raise ValueError('Not enough data to calculate statistic, '
                         'tokens must be longer than %i items.' % _n_norm)
    _stats = []
    for head, tail in zip(range(0, len(tokens), _n_norm),
                          range(_n_norm, len(tokens) + _n_norm, _n_norm)):
        _stats.append(_n_norm / (len(set(tokens[head:tail])) + 0.0))
    return sum(_stats) / (len(_stats) + 0.0)
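
# Usage sketch: the value returned is the mean number of tokens per distinct
# type within each _n_norm-token chunk, so *larger* values indicate *less*
# lexically diverse text:
#
#     >>> normalised_lexical_diversity(tokens, _n_norm=500)
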
def percentage(count, total):
    return 100 * count / (total + 0.0)
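
# Usage sketch (the `+ 0.0` above forces float division under Python 2):
#
#     >>> percentage(25, 200)
#     12.5
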
def sample_from_corpus(corpus, sample_size):
    n = corpus.enumerate_sents()
    sample_indices = set(random.sample(xrange(n), sample_size))
    return [sent for i, sent in enumerate(corpus.sents()) if i in sample_indices]
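
# Usage sketch (assumes `corpus` is a sussex_nltk corpus reader exposing the
# enumerate_sents() and sents() methods used above):
#
#     >>> sample = sample_from_corpus(corpus, 100)
#
# yielding 100 sentences drawn uniformly at random without replacement.
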
def zipf_dist(freqdist, num_of_ranks=50, show_values=True):
    '''
    Given a frequency distribution object, rank all types in order of
    frequency of occurrence (where rank 1 is the most frequent type), and
    plot the ranks against the frequency of occurrence. If num_of_ranks=20,
    then 20 types will be plotted. If show_values=True, then display the bar
    values above them.
    '''
    x = range(1, num_of_ranks + 1)  # x values are the ranks of types
    # y values are the frequencies of the ranked types; this relies on older
    # NLTK (2.x) behaviour, where FreqDist.values() comes back in decreasing
    # frequency order (newer versions would need freqdist.most_common())
    y = freqdist.values()[:num_of_ranks]
    plt.bar(x, y, color="#1AADA4")  # plot a bar graph of x and y
    plt.xlabel("Rank of types ordered by frequency of occurrence")
    plt.ylabel("Frequency of occurrence")  # set the label of the y axis
    plt.grid(True)  # display grid on graph
    # set what values appear on the x axis
    plt.xticks(range(1, num_of_ranks + 1, 2), range(1, num_of_ranks + 1, 2))
    plt.xlim([0, num_of_ranks + 2])  # limit the display on the x axis
    if show_values:  # if show_values is True, then show the y values on the bars
        for xi, yi in zip(x, y):
            plt.text(xi + 0.25, yi + 50, yi,
                     verticalalignment="bottom", rotation=55, fontsize="small")
    plt.show()  # display the graph
    print "Plot complete."
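
# Usage sketch (hypothetical token list; any nltk.probability.FreqDist works):
#
#     >>> fd = nltk.probability.FreqDist(tokens)
#     >>> zipf_dist(fd, num_of_ranks=20)
#
# plots the 20 most frequent types against their frequencies.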