Source code for lib.createKeyWords

import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from nltk.stem.wordnet import WordNetLemmatizer

import ext.common_english_words as common_english_words
import ext.extend_stop_words as custom_stop_words
import ext.util

def correctNickFor_(inText):
    # The last character of a nick may be '_', which breaks nick matching.
    if inText and inText[-1] == '_':
        inText = inText[:-1]
    return inText
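
# A quick illustration (the nicks are hypothetical): trailing underscores are
# stripped so that "rohan_" and "rohan" are matched as the same nick.
#   correctNickFor_("rohan_")  # -> "rohan"
#   correctNickFor_("rohan")   # -> "rohan"
#   correctNickFor_("")        # -> ""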

def dataForNick(data, nick, threshold, min_words_spoken):
    keywords = None
    for dicts in data:
        if dicts['nick'] == nick:
            keywords = dicts['keywords']
            break
    total_freq = 0.0
    for freq_tuple in (keywords or []):  # keywords is None if the nick never appears in data
        total_freq += freq_tuple[1]
    selected_keywords = []
    selected_keywords_normal_freq = []
    if total_freq > min_words_spoken:
        if keywords:
            for keyword in keywords:
                if keyword[2] >= threshold:
                    selected_keywords.append(keyword[0].encode('ascii', 'ignore'))
                    selected_keywords_normal_freq.append(keyword[2])
            if len(selected_keywords) == 0:
                # no word's normalised score crosses the threshold
                selected_keywords = None
        else:
            # no message was sent by this nick
            pass
    else:
        # not enough words spoken by this nick (fewer than min_words_spoken)
        pass
    return (selected_keywords, selected_keywords_normal_freq)
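
# Minimal usage sketch for dataForNick; the nick and the numbers below are made
# up. Each entry of `data` holds [<word>, <frequency>, <normalised_score>]
# triples, in the shape produced by createKeyWords below.
#
#   sample = [{'nick': 'alice', 'keywords': [['kernel', 120, 0.6], ['patch', 80, 0.4]]}]
#   words, scores = dataForNick(sample, 'alice', 0.5, 100)
#   # words -> ['kernel'], scores -> [0.6]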

def createKeyWords(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """ Outputs the keywords for each user on a particular channel after
    normalising the frequency and removing the common stop words.

    Args:
        log_directory (str): Location of the logs (assumed to be arranged in the
            directory structure <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): Channel to perform the analysis on
        output_directory (str): Location of the output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Month to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Month to end the analysis (in conjunction with endingDate)

    Returns:
        tuple: (keywords_filtered, user_keyword_freq_dict, user_words_dict,
        nicks_for_stop_words)
    """
    out_dir_nick_change = output_directory + "key-words/"
    user_words_dict = []
    user_keyword_freq_dict = []
    nick_same_list = [[] for i in range(5000)]  # list of lists, each inner list holding all the nicks of one person
    keywords_filtered = []
    no_messages = 0

    # print "Creating a new output folder"
    # os.system("rm -rf "+out_dir_nick_change)
    # os.system("mkdir "+out_dir_nick_change)

    rem_time = None  # remembers the time of the last message of the file parsed before the current file

    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(folderiterator) + "/" + temp2 + str(fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                # don't report missing files for dates that never exist (Feb 29-31 and the 31st of 30-day months)
                if not ((folderiterator == 2 and (fileiterator == 29 or fileiterator == 30 or fileiterator == 31)) or ((folderiterator == 4 or folderiterator == 6 or folderiterator == 9 or folderiterator == 11) and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue

            with open(filePath) as f:
                content = f.readlines()  # content stores all the lines of the log file for channel_name
            # print "Analysing ", filePath

            nicks = []  # list of all the nicknames

            '''
            Getting all the nicknames in a list nicks[]. A message line looks
            like "[14:02] <nick> message", while status lines (joins, quits,
            nick changes) start with '='.
            '''
            for i in content:
                if(i[0] != '=' and "] <" in i and "> " in i):
                    m = re.search(r"\<(.*?)\>", i)  # the string between <> is the sender's nick
                    if m.group(0) not in nicks:
                        nicks.append(m.group(0))

            for i in xrange(0, len(nicks)):
                nicks[i] = nicks[i][1:-1]  # remove the <> around the nicknames

            for i in xrange(0, len(nicks)):
                nicks[i] = ext.util.correctLastCharCR(nicks[i])

            for line in content:
                # exclude topic changes; search only for nick changes
                if(line[0] == '=' and "changed the topic of" not in line):
                    nick1 = ext.util.correctLastCharCR(line[line.find("=") + 1:line.find(" is")][3:])
                    nick2 = ext.util.correctLastCharCR(line[line.find("wn as") + 1:line.find("\n")][5:])
                    if nick1 not in nicks:
                        nicks.append(nick1)
                    if nick2 not in nicks:
                        nicks.append(nick2)

            '''
            Forming a list of lists to avoid nickname duplication
            '''
            for line in content:
                if(line[0] == '=' and "changed the topic of" not in line):
                    line1 = line[line.find("=") + 1:line.find(" is")][3:]
                    line2 = line[line.find("wn as") + 1:line.find("\n")][5:]
                    line1 = ext.util.correctLastCharCR(line1)
                    line2 = ext.util.correctLastCharCR(line2)
                    for i in range(5000):
                        if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break

            for line in content:
                flag_comma = 0
                if(line[0] != '=' and "] <" in line and "> " in line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]
                    var = ext.util.correctLastCharCR(var)
                    for d in range(len(nicks)):
                        if var in nick_same_list[d]:
                            nick_sender = nick_same_list[d][0]
                            break
                        else:
                            nick_sender = var

                    nick_receiver = ''
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list split about ':'
                        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:len(rec_list[1])]
                        rec_list[1] = rec_list[1][1:]
                        if not rec_list[1]:  # index 0 will contain the time, e.g. 14:02
                            break
                        for k in xrange(0, len(rec_list)):
                            if(rec_list[k]):  # checking for a trailing carriage return
                                rec_list[k] = ext.util.correctLastCharCR(rec_list[k])
                        for z in rec_list:
                            if(z == i):
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                        if "," in rec_list[1]:  # receiver list may be of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            for y in xrange(0, len(rec_list_2)):
                                if(rec_list_2[y]):
                                    rec_list_2[y] = ext.util.correctLastCharCR(rec_list_2[y])
                            for j in rec_list_2:
                                if(j == i):
                                    if(var != i):
                                        for d in range(len(nicks)):
                                            if i in nick_same_list[d]:
                                                nick_receiver = nick_same_list[d][0]
                                                break
                                            else:
                                                nick_receiver = i

                        if(flag_comma == 0):  # receiver list can be of the form <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = rec[1:]
                            rec = ext.util.correctLastCharCR(rec)
                            if(rec == i):
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if i in nick_same_list[d]:
                                            nick_receiver = nick_same_list[d][0]
                                            break
                                        else:
                                            nick_receiver = i

                    # generating the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReceiver = correctNickFor_(nick_receiver)
                    if correctedNickReceiver in message:
                        message.remove(correctedNickReceiver)

                    lmtzr = WordNetLemmatizer()
                    # keep words of length >= 3 and drop numbers
                    word_list_temp = re.sub(r'\d+', '', " ".join(re.findall(r'\w{3,}', ":".join(message).replace(",", " ")))).split(" ")
                    # lower-case the words and remove apostrophes
                    word_list = []
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    # lemmatize each word first as a verb, then as a noun (the default)
                    word_list_lemmatized = []
                    try:
                        word_list_lemmatized = map(lmtzr.lemmatize, map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({'sender': nick_sender, 'words': word_list_lemmatized})

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictionary in user_words_dict:
        nicks_for_stop_words.append(dictionary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = text.ENGLISH_STOP_WORDS.union(common_english_words.words).union(nicks_for_stop_words).union(stop_word_without_apostrophe).union(custom_stop_words.words).union(custom_stop_words.slangs)
    count_vect = CountVectorizer(analyzer='word', stop_words=stop_words_extended, min_df=1)

    for dictionary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictionary['words'])
            freqs = [[word, matrix.getcol(idx).sum()] for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            # append the normalised score; entries become [<word>, <frequency>, <normalised_score>]
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({'nick': dictionary['sender'], 'keywords': keywords})
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = dataForNick(user_keyword_freq_dict, data['nick'], 0.01, 100)
        if keywords:
            keywords_filtered.append({'nick': data['nick'], 'keywords': keywords})

    # summary: <month> <messages parsed> <distinct senders>
    print str(startingMonth) + "\t" + str(no_messages) + "\t" + str(len(user_words_dict))
    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
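
# Hedged usage sketch: the paths, channel, and dates below are illustrative.
# log_directory is expected to end with a slash and point at a <year> folder,
# so that logs are read from e.g. <log_directory>06/01/<channel_name>.txt.
#
#   keywords_filtered, freq_dict, words_dict, stop_nicks = createKeyWords(
#       "/home/user/logs/2013/", "#kubuntu-devel", "/home/user/out/",
#       1, 6, 30, 6)  # June 1 to June 30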