Source code for lib.analysis.user

import re
import sys
from time import time

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

sys.path.append('../lib')
import config
import util

sys.path.append('../..')
import ext.common_english_words as common_english_words
import ext.extend_stop_words as custom_stop_words

def nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False):
    """ Creates a graph which tracks the nick changes of the users, where each
    edge carries a time stamp denoting the time at which the nick was changed
    by the user.

    Args:
        log_dict (dict): dictionary of logs created using reader.py

    Returns:
        list of day-to-day nick-change graphs if DAY_BY_DAY_ANALYSIS=True,
        or else an aggregate nick-change graph for the given time period.
    """
    rem_time = None  # remembers the time of the last message of the file parsed before the current file
    nick_change_day_list = []
    aggregate_nick_change_graph = nx.MultiDiGraph()  # graph for nick changes in the whole time span (not day to day)

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_nick_change_graph = nx.MultiDiGraph()  # using networkx
            current_line_no = -1

            for line in day_log:
                current_line_no = current_line_no + 1
                if line[0] == '=' and "changed the topic of" not in line:
                    # exclude topic changes; search for nick changes only
                    nick1 = util.correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                    nick2 = util.correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])

                    earlier_line_no = current_line_no
                    while earlier_line_no >= 0:  # find the line just before "==" so as to find the time of the nick change
                        earlier_line_no = earlier_line_no - 1
                        if day_log[earlier_line_no][0] != '=':
                            year, month, day = util.get_year_month_day(day_content)
                            util.build_graphs(nick1, nick2, day_log[earlier_line_no][1:6], year, month, day,
                                              today_nick_change_graph, aggregate_nick_change_graph)
                            break
                    if earlier_line_no == -1:
                        today_nick_change_graph.add_edge(nick1, nick2, weight=rem_time)
                        aggregate_nick_change_graph.add_edge(nick1, nick2, weight=rem_time)

            count = len(day_log) - 1  # set up rem_time for the next file by noting the last message sent in this file
            while count >= 0:
                if day_log[count][0] != '=':
                    rem_time = day_log[count][1:6]
                    break
                count = count - 1
            nick_change_day_list.append(today_nick_change_graph)

    if DAY_BY_DAY_ANALYSIS:
        return nick_change_day_list
    else:
        return aggregate_nick_change_graph
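
# Illustrative usage sketch (not part of the module): how the aggregate graph
# returned above might be inspected. `log_dict` is assumed to be the dictionary
# produced by reader.py; the edge attribute name depends on util.build_graphs,
# so the 'weight' lookup below is only an assumption.
#
#   nick_changes = nick_change_graph(log_dict, DAY_BY_DAY_ANALYSIS=False)
#   for old_nick, new_nick, data in nick_changes.edges(data=True):
#       print old_nick, "->", new_nick, "at", data.get('weight')
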
def top_keywords_for_nick(user_keyword_freq_dict, nick, threshold, min_words_spoken):
    """ Outputs the top keywords for a particular nick.

    Args:
        user_keyword_freq_dict (dict): dictionary for each user having keywords and their frequency
        nick (str): user to do analysis on
        threshold (float): threshold on normalised values to separate meaningful words
        min_words_spoken (int): threshold on the minimum number of words spoken by a user to perform analysis on

    Returns:
        tuple: (top_keywords, top_keywords_normal_freq)
    """
    keywords = None
    for dicts in user_keyword_freq_dict:
        if dicts['nick'] == nick:
            keywords = dicts['keywords']
            break

    total_freq = 0.0
    for freq_tuple in keywords:
        total_freq += freq_tuple[1]

    top_keywords = []
    top_keywords_normal_freq = []
    if total_freq > min_words_spoken:
        if keywords:
            for keyword in keywords:
                if keyword[2] >= threshold:
                    top_keywords.append(keyword[0].encode('ascii', 'ignore'))
                    top_keywords_normal_freq.append(keyword[2])
            if len(top_keywords) == 0:
                if config.DEBUGGER:
                    print "No word's normalised score crosses the value of", threshold
                top_keywords = None
        else:
            if config.DEBUGGER:
                print "No message sent by nick", nick
    else:
        if config.DEBUGGER:
            print "Not enough words spoken by", nick, "; spoke", int(total_freq), "words only, required", min_words_spoken
    return (top_keywords, top_keywords_normal_freq)
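
# Illustrative sketch (not part of the module): the shape of
# user_keyword_freq_dict that this function expects, and a hypothetical call.
# Each entry under 'keywords' is a [word, frequency, normalised_frequency]
# list, as built by keywords() below; the nick 'alice' and the numbers are
# made up for the example.
#
#   example_freq_dict = [{'nick': 'alice',
#                         'keywords': [['python', 42, 0.4], ['graph', 7, 0.06]]}]
#   top_words, scores = top_keywords_for_nick(example_freq_dict, 'alice',
#                                             threshold=0.01, min_words_spoken=40)
#   # -> (['python', 'graph'], [0.4, 0.06])
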
def keywords(log_dict, nicks, nick_same_list):
    """ Returns keywords for all users.

    Args:
        log_dict (dict): dictionary of logs data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of same_nick names created using nickTracker.py

    Returns:
        keywords_filtered: filtered keywords per user
        user_keyword_freq_dict: dictionary for each user having keywords and their frequency
        user_words_dict: keywords per user
        nicks_for_stop_words: stop words
    """
    user_words_dict = []
    user_keyword_freq_dict = []
    keywords_filtered = []
    no_messages = 0

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = util.correctLastCharCR(m.group(0)[1:-1])
                    for d in range(len(nicks)):
                        if var in nick_same_list[d]:
                            nick_sender = nick_same_list[d][0]
                            break
                        else:
                            nick_sender = var

                    nick_receiver = ''
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about ':'
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:  # index 0 will contain the time, e.g. 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for z in rec_list:
                            if z == i:
                                if var != i:
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                        if "," in rec_list[1]:  # receiver list may be of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for j in rec_list_2:
                                if j == i:
                                    if var != i:
                                        for d in range(len(nicks)):
                                            if i in nick_same_list[d]:
                                                nick_receiver = nick_same_list[d][0]
                                                break
                                            else:
                                                nick_receiver = i

                        if flag_comma == 0:  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = util.correctLastCharCR(line[line.find(">") + 1:line.find(", ")][1:])
                            if rec == i:
                                if var != i:
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                    # generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    lmtzr = WordNetLemmatizer()

                    # keep words of length >= 3, drop numbers
                    word_list_temp = re.sub(r'\d+', '', " ".join(re.findall(r'\w{3,}', ":".join(message).replace(",", " ")))).split(" ")
                    word_list = []
                    # remove punctuation
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))

                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({'sender': nick_sender, 'words': word_list_lemmatized})

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)

    count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended, min_df=1)

    for dictonary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            freqs = [[word, matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({'nick': dictonary['sender'], 'keywords': keywords})
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(user_keyword_freq_dict, data['nick'],
                                                        config.KEYWORDS_THRESHOLD, config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER:
            print "Nick:", data['nick']
            print "Keywords with normalised score >", config.KEYWORDS_THRESHOLD, "\n", keywords
            print "Their normalised scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
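
# Illustrative sketch (not part of the module): consuming the four values
# returned by keywords(). log_dict, nicks and nick_same_list are assumed to
# come from reader.py / nickTracker.py as the docstring above describes.
#
#   filtered, keyword_freq, words_per_user, stop_nicks = keywords(log_dict, nicks, nick_same_list)
#   for entry in filtered:
#       print entry['nick'], entry['keywords']   # keywords above the configured threshold
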
def keywords_clusters(log_dict, nicks, nick_same_list):
    """ Uses `keywords` to form clusters of words after TF-IDF (optional LSA).

    Args:
        log_dict (dict): dictionary of logs data created using reader.py
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of same_nick names created using nickTracker.py

    Returns:
        null
    """
    # AUTO TF-IDF FROM JUST SENTENCES
    # http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

    # BUILDING CORPUS
    keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words = keywords(log_dict, nicks, nick_same_list)

    corpus = []
    for user_words_dict in user_words_dict_list:
        corpus.append(" ".join(map(str, user_words_dict['words'])))

    print "No. of users", len(corpus)

    # TF-IDF
    stop_word_without_apostrophe = []
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=stop_words_extended, use_idf=True)
    print "Extracting features from the training dataset using TF-IDF"
    t0 = time()
    tf_idf = vectorizer.fit_transform(corpus)
    print("done in %fs" % (time() - t0))
    print "n_samples: %d, n_features: %d \n" % tf_idf.shape

    # LSA
    if config.ENABLE_SVD:
        print("============USING SVD==========")
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(100)  # recommended value = 100
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)

        tf_idf = lsa.fit_transform(tf_idf)

        print("done in %fs" % (time() - t0))

        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

    if not config.ENABLE_ELBOW_METHOD_FOR_K:
        # CLUSTERING
        km = KMeans(n_clusters=config.NUMBER_OF_CLUSTERS, init='k-means++',
                    random_state=3465, max_iter=100, n_init=8)

        print("Clustering sparse data with %s" % km)
        t0 = time()
        km.fit(tf_idf)
        print("done in %0.3fs" % (time() - t0))

        print("Top terms per cluster:")

        if config.ENABLE_SVD:
            # map the centroids back to the original term space so that the
            # term indices below line up with the vectorizer vocabulary
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
            display_centroids = original_space_centroids
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
            display_centroids = km.cluster_centers_

        np.set_printoptions(threshold=np.nan)
        terms = vectorizer.get_feature_names()
        for i in range(config.NUMBER_OF_CLUSTERS):
            print("Cluster %d:" % i)
            for ind in order_centroids[i, :config.SHOW_N_WORDS_PER_CLUSTER]:
                print terms[ind] + "\t" + str(round(display_centroids[i][ind], 2))
            print ""
    else:
        print "============ELBOW METHOD ============="

        sum_squared_errors_list = []
        avg_sum_squared_errors_list = []

        for number_of_clusters in xrange(1, config.CHECK_K_TILL + 1):
            print "\n===>> K =", number_of_clusters

            km = KMeans(n_clusters=number_of_clusters, init='k-means++', max_iter=100, n_init=8)
            t0 = time()
            km.fit(tf_idf)

            if config.ENABLE_SVD:
                original_space_centroids = svd.inverse_transform(km.cluster_centers_)
                order_centroids = original_space_centroids.argsort()[:, ::-1]
            else:
                order_centroids = km.cluster_centers_.argsort()[:, ::-1]

            distance_matrix_all_combination = cdist(tf_idf, km.cluster_centers_, 'euclidean')
            # cIdx = np.argmin(distance_matrix_all_combination, axis=1)
            distance_from_nearest_centroid = np.min(distance_matrix_all_combination, axis=1)
            sum_squared_errors = sum(distance_from_nearest_centroid)
            avg_sum_squared_errors = sum_squared_errors / tf_idf.shape[0]

            print "Sum Squared Error =", sum_squared_errors
            print "Avg Sum Squared Error =", avg_sum_squared_errors

            sum_squared_errors_list.append(sum_squared_errors)
            avg_sum_squared_errors_list.append(avg_sum_squared_errors)

            print("Top terms per cluster:")
            terms = vectorizer.get_feature_names()
            for i in range(number_of_clusters):
                print("Cluster %d:" % i)
                for ind in order_centroids[i, :config.SHOW_N_WORDS_PER_CLUSTER]:
                    print(' %s' % terms[ind])
                print ""

        plt.plot(range(1, config.CHECK_K_TILL + 1), sum_squared_errors_list, 'b*-')
        # ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
        #         markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
        plt.grid(True)
        plt.xlabel('Number of clusters')
        plt.ylabel('Average sum of squares')
        plt.title('Elbow for KMeans clustering')
        plt.show()

    print "NOTE: outputs vary between runs because of random initialisation"
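
# Illustrative sketch (not part of the module): the config switches read by
# keywords_clusters(). The attribute names match the references above; the
# values shown are only plausible placeholders, not the project's defaults.
#
#   config.ENABLE_SVD = True                  # LSA / TruncatedSVD before KMeans
#   config.ENABLE_ELBOW_METHOD_FOR_K = False  # True -> sweep K and plot the elbow curve
#   config.NUMBER_OF_CLUSTERS = 10
#   config.SHOW_N_WORDS_PER_CLUSTER = 10
#   config.CHECK_K_TILL = 20                  # upper bound of K for the elbow sweep
#   keywords_clusters(log_dict, nicks, nick_same_list)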