Source code for lib.keyWordsCluster_KMeansTFIDF

import createKeyWords as CKW
import nltk.cluster.util
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from time import time
import ext.common_english_words as common_english_words
import ext.extend_stop_words as custom_stop_words
from sklearn.feature_extraction import text 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

def keyWordsCluster_KMeansTFIDF(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """
    Uses `createKeyWords` to build a per-user corpus, applies TF-IDF, and
    clusters the resulting keyword vectors with KMeans (optionally after
    LSA/SVD dimensionality reduction).

    Args:
        log_directory (str): location of the logs (assumed to be arranged in
            a directory structure such as
            <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): channel to perform the analysis on
        output_directory (str): location of the output directory
        startingDate (int): date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): month to start the analysis (in conjunction with startingDate)
        endingDate (int): date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): month to end the analysis (in conjunction with endingDate)

    Returns:
        None
    """
    do_SVD = False
    words_to_show_per_cluster = 10
    elbow_method_for_finding_K = False

    '''NON ELBOW'''
    number_of_clusters = 11  # elbow for jan-2013 =

    '''ELBOW SETTINGS'''
    check_k_till = 20

    '''
    MANUALLY CREATING A MATRIX
    (kept for reference; re-enabling this block would also require importing
    TfidfTransformer from sklearn.feature_extraction.text)
    '''
    # each user's normalised frequency stored in rows
    # all the keywords (unfiltered)
    # keyword_list = []
    # user_list = []
    # keyword_dict_list, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth)
    # for dictionary in user_keyword_freq_dict:
    #     # print dictionary['keywords']
    #     keyword_list = list(set(keyword_list + [x[0] for x in dictionary['keywords']]))
    #     user_list.append(dictionary['nick'])
    # # print "\n \n \n", "KEYWORDS_LIST", keyword_list
    # # print "\n \n \n", "USER_LIST", user_list

    # # GENERATE A MATRIX WITH USERS AS ROWS AND KEYWORDS AS COLUMNS
    # user_keyword_matrix = np.zeros(shape=(len(user_list), len(keyword_list)))
    # # user_keyword_matrix = [[0]*len(keyword_list) for _ in xrange(len(user_list))]
    # for dictionary in user_keyword_freq_dict:
    #     # print dictionary['nick'], user_list.index(dictionary['nick'])
    #     for word_tuple in dictionary['keywords']:
    #         # print word_tuple, keyword_list.index(word_tuple[0])
    #         user_keyword_matrix[user_list.index(dictionary['nick'])][keyword_list.index(word_tuple[0])] += word_tuple[1]
    # print user_keyword_matrix
    # transformer = TfidfTransformer()
    # tfidf = transformer.fit_transform(user_keyword_matrix)
    # tfIDFMatrix = tfidf.toarray()
    # print np.nonzero(tfIDFMatrix)
    # # Each row is normalized to have unit euclidean norm.
    # # The weights of each feature computed by the fit method call are stored in a model attribute:
    # print "Weights of each feature", transformer.idf_
    # for i in xrange(len(transformer.idf_)):
    #     print keyword_list[i], transformer.idf_[i]

    ''' AUTO TFIDF FROM JUST SENTENCES '''
    # http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

    # BUILDING CORPUS: one space-joined document per user
    keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = CKW.createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth)
    corpus = []
    for user_words_dict in user_words_dict_list:
        # print "SENDER", user_words_dict['sender']
        # print "WORDS", " ".join(user_words_dict['words'])
        corpus.append(" ".join(map(str, user_words_dict['words'])))

    print "No. of users", len(corpus)
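    # Editor's note on the weighting applied below: with scikit-learn's
    # defaults (smooth_idf=True, norm='l2'), each entry of the TF-IDF matrix
    # is tf(t, d) * idf(t) with idf(t) = ln((1 + n) / (1 + df(t))) + 1, and
    # each row is scaled to unit Euclidean norm before clustering.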
    # TF-IDF
    stop_word_without_apostrophe = []
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True)
    print "Extracting features from the training dataset using TF-IDF"
    t0 = time()
    tf_idf = vectorizer.fit_transform(corpus)
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d \n" % tf_idf.shape

    # LSA
    if do_SVD:
        print "============USING SVD=========="
        print "Performing dimensionality reduction using LSA"
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(100)  # recommended value = 100
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        tf_idf = lsa.fit_transform(tf_idf)
        print "done in %fs" % (time() - t0)
        explained_variance = svd.explained_variance_ratio_.sum()
        print "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))

    if not elbow_method_for_finding_K:
        # CLUSTERING with a fixed K
        km = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=3465, max_iter=100, n_init=8)
        print "Clustering sparse data with %s" % km
        t0 = time()
        km.fit(tf_idf)
        print "done in %0.3fs" % (time() - t0)

        print "Top terms per cluster:"
        if do_SVD:
            # map centroids back to the original term space before ranking terms
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]

        np.set_printoptions(threshold=np.nan)
        terms = vectorizer.get_feature_names()
        for i in range(number_of_clusters):
            print "Cluster %d:" % i
            for ind in order_centroids[i, :words_to_show_per_cluster]:
                print terms[ind] + "\t" + str(round(km.cluster_centers_[i][ind], 2))
            print ""
    else:
        print "============ELBOW METHOD ============="
        sum_squared_errors_list = []
        avg_sum_squared_errors_list = []
        # cdist needs dense input; the vectorizer output is sparse unless SVD ran
        dense_tf_idf = tf_idf.toarray() if hasattr(tf_idf, 'toarray') else tf_idf
        for k in xrange(1, check_k_till + 1):
            print "\n===>> K = ", k
            km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=8)
            t0 = time()
            km.fit(tf_idf)
            if do_SVD:
                original_space_centroids = svd.inverse_transform(km.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = km.cluster_centers_.argsort()[:, ::-1]

            distance_matrix_all_combination = cdist(dense_tf_idf, km.cluster_centers_, 'euclidean')
            # cIdx = np.argmin(distance_matrix_all_combination, axis=1)
            distance_from_nearest_centroid = np.min(distance_matrix_all_combination, axis=1)
            # note: these are plain Euclidean distances (not squared), despite the names
            sum_squared_errors = sum(distance_from_nearest_centroid)
            avg_sum_squared_errors = sum_squared_errors / tf_idf.shape[0]
            print "Sum Squared Error =", sum_squared_errors
            print "Avg Sum Squared Error =", avg_sum_squared_errors
            sum_squared_errors_list.append(sum_squared_errors)
            avg_sum_squared_errors_list.append(avg_sum_squared_errors)

            print "Top terms per cluster:"
            terms = vectorizer.get_feature_names()
            for i in range(k):
                print "Cluster %d:" % i
                for ind in order_centroids[i, :words_to_show_per_cluster]:
                    print ' %s' % terms[ind]
                print ""

        plt.plot(range(1, check_k_till + 1), sum_squared_errors_list, 'b*-')
        # ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
        #         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
        plt.grid(True)
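        # Reading the elbow plot: the error decreases monotonically as K grows;
        # the K to pick is where the curve bends sharply and flattens, since
        # larger K buys little further reduction in within-cluster distance.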
        plt.xlabel('Number of clusters')
        plt.ylabel('Sum of distances to nearest centroid')
        plt.title('Elbow for KMeans clustering')
        plt.savefig(output_directory + 'key-words/' + 'elbow_KMeans.png')
        plt.show()

    # NOTE: outputs vary between runs because of random initialisation
    print "NOTE RANDOM OUTPUTS BECAUSE OF RANDOM INITIALISATION"
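A minimal usage sketch (an editor's addition, not part of the original module): the paths, channel name, and dates below are hypothetical placeholders, and the call assumes the <year>/<month>/<day>/ log layout described in the docstring.

if __name__ == "__main__":
    # Hypothetical example: analyse January logs for one channel.
    keyWordsCluster_KMeansTFIDF(
        log_directory="/path/to/logs/",
        channel_name="channel-name",
        output_directory="/path/to/output/",
        startingDate=1, startingMonth=1,
        endingDate=31, endingMonth=1)

Note that the elbow branch saves its plot to output_directory + 'key-words/', and matplotlib's savefig does not create directories, so that subdirectory must already exist.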