Source code for lib.createKeyWords

import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from nltk.stem.wordnet import WordNetLemmatizer

import ext.common_english_words as common_english_words
import ext.extend_stop_words as custom_stop_words
import ext.util

def correctNickFor_(inText):
    # The last character of a nick may be '_', which breaks nick matching.
    if inText and inText[-1] == '_':
        inText = inText[:-1]
    return inText
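
# A quick illustration (the nicks are hypothetical): trailing underscores are
# stripped so that "rohan_" and "rohan" are matched as the same nick.
#   correctNickFor_("rohan_")  # -> "rohan"
#   correctNickFor_("rohan")   # -> "rohan"
#   correctNickFor_("")        # -> ""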

def dataForNick(data, nick, threshold, min_words_spoken):
    keywords = None
    for dicts in data:
        if dicts['nick'] == nick:
            keywords = dicts['keywords']
            break
    total_freq = 0.0
    for freq_tuple in (keywords or []):  # keywords is None if the nick never appears in data
        total_freq += freq_tuple[1]
    selected_keywords = []
    selected_keywords_normal_freq = []
    if total_freq > min_words_spoken:
        if keywords:
            for keyword in keywords:
                if keyword[2] >= threshold:
                    selected_keywords.append(keyword[0].encode('ascii', 'ignore'))
                    selected_keywords_normal_freq.append(keyword[2])
            if len(selected_keywords) == 0:
                # no word's normalised score crosses the threshold
                selected_keywords = None
        else:
            # no message was sent by this nick
            pass
    else:
        # not enough words spoken by this nick (fewer than min_words_spoken)
        pass
    return (selected_keywords, selected_keywords_normal_freq)
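
# Minimal usage sketch for dataForNick; the nick and the numbers below are made
# up. Each entry of `data` holds [<word>, <frequency>, <normalised_score>]
# triples, in the shape produced by createKeyWords below.
#
#   sample = [{'nick': 'alice', 'keywords': [['kernel', 120, 0.6], ['patch', 80, 0.4]]}]
#   words, scores = dataForNick(sample, 'alice', 0.5, 100)
#   # words -> ['kernel'], scores -> [0.6]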

def createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """ Outputs the keywords for each user on a particular channel after
    normalising the frequency and removing the common stop words.

    Args:
        log_directory (str): Location of the logs (assumed to be arranged in the
            directory structure <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): Channel to perform the analysis on
        output_directory (str): Location of the output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Month to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Month to end the analysis (in conjunction with endingDate)

    Returns:
        tuple: (keywords_filtered, user_keyword_freq_dict, user_words_dict,
        nicks_for_stop_words)
    """
    out_dir_nick_change = output_directory + "key-words/"
    user_words_dict = []
    user_keyword_freq_dict = []
    nick_same_list = [[] for i in range(5000)]  # list of lists, each inner list holding all the nicks of one person
    keywords_filtered = []
    no_messages = 0

    # print "Creating a new output folder"
    # os.system("rm -rf "+out_dir_nick_change)
    # os.system("mkdir "+out_dir_nick_change)

    rem_time = None  # remembers the time of the last message of the file parsed before the current file

    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(folderiterator) + "/" + temp2 + str(fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                # don't report missing files for dates that never exist (Feb 29-31 and the 31st of 30-day months)
                if not ((folderiterator == 2 and (fileiterator == 29 or fileiterator == 30 or fileiterator == 31)) or ((folderiterator == 4 or folderiterator == 6 or folderiterator == 9 or folderiterator == 11) and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue

            with open(filePath) as f:
                content = f.readlines()  # content stores all the lines of the log file for channel_name
            # print "Analysing ", filePath

            nicks = []  # list of all the nicknames

            '''
            Getting all the nicknames in a list nicks[]. A message line looks
            like "[14:02] <nick> message", while status lines (joins, quits,
            nick changes) start with '='.
            '''
            for i in content:
                if(i[0] != '=' and "] <" in i and "> " in i):
                    m = re.search(r"\<(.*?)\>", i)  # the string between <> is the sender's nick
                    if m.group(0) not in nicks:
                        nicks.append(m.group(0))

            for i in xrange(0, len(nicks)):
                nicks[i] = nicks[i][1:-1]  # remove the <> around the nicknames

            for i in xrange(0, len(nicks)):
                nicks[i] = ext.util.correctLastCharCR(nicks[i])

            for line in content:
                # exclude topic changes; search only for nick changes
                if(line[0] == '=' and "changed the topic of" not in line):
                    nick1 = ext.util.correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                    nick2 = ext.util.correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])
                    if nick1 not in nicks:
                        nicks.append(nick1)
                    if nick2 not in nicks:
                        nicks.append(nick2)

            '''
            Forming a list of lists to avoid nickname duplication
            '''
            for line in content:
                if(line[0] == '=' and "changed the topic of" not in line):
                    line1 = line[line.find("=") + 1:line.find(" is")][3:]
                    line2 = line[line.find("wn as") + 1:line.find("\n")][5:]
                    line1 = ext.util.correctLastCharCR(line1)
                    line2 = ext.util.correctLastCharCR(line2)
                    for i in range(5000):
                        if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break

            for line in content:
                flag_comma = 0
                if(line[0] != '=' and "] <" in line and "> " in line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]
                    var = ext.util.correctLastCharCR(var)
                    for d in range(len(nicks)):
                        if var in nick_same_list[d]:
                            nick_sender = nick_same_list[d][0]
                            break
                        else:
                            nick_sender = var

                    nick_receiver = ''
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about ':'
                        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:len(rec_list[1])]
                        rec_list[1] = rec_list[1][1:]
                        if not rec_list[1]:  # index 0 will contain the time, e.g. 14:02
                            break
                        for k in xrange(0, len(rec_list)):
                            if(rec_list[k]):  # checking for a trailing carriage return
                                rec_list[k] = ext.util.correctLastCharCR(rec_list[k])
                        for z in rec_list:
                            if(z == i):
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                        if "," in rec_list[1]:  # receiver list may be of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            for y in xrange(0, len(rec_list_2)):
                                if(rec_list_2[y]):
                                    rec_list_2[y] = ext.util.correctLastCharCR(rec_list_2[y])
                            for j in rec_list_2:
                                if(j == i):
                                    if(var != i):
                                        for d in range(len(nicks)):
                                            if i in nick_same_list[d]:
                                                nick_receiver = nick_same_list[d][0]
                                                break
                                            else:
                                                nick_receiver = i

                        if(flag_comma == 0):  # receiver list can be of the form <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = rec[1:]
                            rec = ext.util.correctLastCharCR(rec)
                            if(rec == i):
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                    # generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReceiver = correctNickFor_(nick_receiver)
                    if correctedNickReceiver in message:
                        message.remove(correctedNickReceiver)

                    lmtzr = WordNetLemmatizer()
                    # keep words of length >= 3 and drop numbers
                    word_list_temp = re.sub(r'\d+', '', " ".join(re.findall(r'\w{3,}', ":".join(message).replace(",", " ")))).split(" ")
                    # lower-case the words and remove apostrophes
                    word_list = []
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    # lemmatize each word first as a verb, then as a noun (the default)
                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({'sender': nick_sender, 'words': word_list_lemmatized})

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictionary in user_words_dict:
        nicks_for_stop_words.append(dictionary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)
    count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended, min_df=1)

    for dictionary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictionary['words'])
            freqs = [[word, matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            # append the normalised score; entries become [<word>, <frequency>, <normalised_score>]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({'nick': dictionary['sender'], 'keywords': keywords})
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = dataForNick(user_keyword_freq_dict, data['nick'], 0.01, 100)
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    # summary: <month> <messages parsed> <distinct senders>
    print str(startingMonth) + "\t" + str(no_messages) + "\t" + str(len(user_words_dict))
    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
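
# Hedged usage sketch: the paths, channel, and dates below are illustrative.
# log_directory is expected to end with a slash and point at a <year> folder,
# so that logs are read from e.g. <log_directory>06/01/<channel_name>.txt.
#
#   keywords_filtered, freq_dict, words_dict, stop_nicks = createKeyWords(
#       "/home/user/logs/2013/", "#kubuntu-devel", "/home/user/out/",
#       1, 6, 30, 6)  # June 1 to June 30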