Source code for lib.ConvL_ConvRT

import numpy
import datetime
import time
import pandas as pd
import os.path
import re
import networkx as nx
from networkx.algorithms.components.connected import connected_components
import matplotlib.pyplot as plt
import pylab
import pygraphviz
import sys
import csv
import errno
import ext.util

def findConvLength_ConvRefreshTime(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """ Calculates the conversation length, i.e. the length of time for which two users
    communicate. If a message is not replied to within RT, it is considered part of another
    conversation. This function also calculates the conversation refresh time: for a pair of
    users, this is the time between the end of one conversation and the start of the next.

    Args:
        log_directory (str): Location of the logs (assumed to be arranged in the directory
            structure <year>/<month>/<day>/<log-file-for-channel>.txt)
        channel_name (str): Channel to perform the analysis on
        output_directory (str): Location of the output directory
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Month to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Month to end the analysis (in conjunction with endingDate)

    Returns:
        None
    """
    nick_same_list = [[] for i in range(7000)]
    nicks = []      # list of all the nicknames
    conv = []       # conversation lengths (CL)
    conv_diff = []  # conversation refresh times (CRT)
    # out_dir_msg_num = output_directory + "CL/"
    out_dir_msg_num = output_directory

    if not os.path.exists(os.path.dirname(out_dir_msg_num)):
        try:
            os.makedirs(os.path.dirname(out_dir_msg_num))
        except OSError as exc:  # guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # First pass over the logs: collect nicknames and group aliases of the same user.
    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(folderiterator) + "/" + temp2 + str(fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                if not((folderiterator == 2 and (fileiterator == 29 or fileiterator == 30 or fileiterator == 31)) or ((folderiterator == 4 or folderiterator == 6 or folderiterator == 9 or folderiterator == 11) and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue

            with open(filePath) as f:
                content = f.readlines()  # content stores all the lines of the log file for channel_name

            send_time = []  # list of all the times a user sends a message to another user
            nicks_for_the_day = []
            print(filePath)

            # code for getting all the nicknames in a list
            for i in content:
                if(i[0] != '=' and "] <" in i and "> " in i):
                    m = re.search(r"\<(.*?)\>", i)
                    if m.group(0) not in nicks_for_the_day:
                        nicks_for_the_day.append(m.group(0))  # used a regex to get the string between <> and appended it to the list

            for i in xrange(0, len(nicks_for_the_day)):
                if nicks_for_the_day[i][1:-1] not in nicks:
                    nicks.append(nicks_for_the_day[i][1:-1])  # removed <> from the nicknames

            for i in xrange(0, len(nicks)):
                if(len(nicks[i]) != 0):
                    nicks[i] = ext.util.correctLastCharCR(nicks[i])

            # nick-change lines ("x is now known as y") also contribute nicknames
            for j in content:
                if(j[0] == '=' and "changed the topic of" not in j):
                    line1 = j[j.find("=") + 1:j.find(" is")]
                    line2 = j[j.find("wn as") + 1:j.find("\n")]
                    line1 = line1[3:]
                    line2 = line2[5:]
                    if(len(line1) != 0):
                        line1 = ext.util.correctLastCharCR(line1)
                    if(len(line2) != 0):
                        line2 = ext.util.correctLastCharCR(line2)
                    if line1 not in nicks:
                        nicks.append(line1)
                    if line2 not in nicks:
                        nicks.append(line2)

            # code for forming a list of lists to avoid nickname duplication
            for line in content:
                if(line[0] == '=' and "changed the topic of" not in line):
                    line1 = line[line.find("=") + 1:line.find(" is")]
                    line2 = line[line.find("wn as") + 1:line.find("\n")]
                    line1 = line1[3:]
                    line2 = line2[5:]
                    if(len(line1) != 0):
                        line1 = ext.util.correctLastCharCR(line1)
                    if(len(line2) != 0):
                        line2 = ext.util.correctLastCharCR(line2)
                    for i in range(7000):
                        if line1 in nick_same_list[i] or line2 in nick_same_list[i]:
                            if line1 in nick_same_list[i] and line2 not in nick_same_list[i]:
                                nick_same_list[i].append(line2)
                                break
                            if line2 in nick_same_list[i] and line1 not in nick_same_list[i]:
                                nick_same_list[i].append(line1)
                                break
                            if line2 in nick_same_list[i] and line1 in nick_same_list[i]:
                                break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(line1)
                            nick_same_list[i].append(line2)
                            break

    # every nick that never changed still needs its own cluster
    for ni in nicks:
        for ind in range(7000):
            if ni in nick_same_list[ind]:
                break
            if not nick_same_list[ind]:
                nick_same_list[ind].append(ni)
                break

    G = ext.util.to_graph(nick_same_list)
    L = [list(c) for c in connected_components(G)]  # materialise the components as lists (newer networkx versions return a generator of sets)
    for i in range(1, len(L) + 1):
        L[i - 1] = [i] + L[i - 1]
    # We use the connected components algorithm to merge all nick clusters that have at least one nick in common. E.g. with
    # Cluster 1 - nick1, nick2, nick3, nick4 (some nicks of a user) and
    # Cluster 2 - nick5, nick6, nick2, nick7,
    # we get nick1, nick2, nick3, nick4, nick5, nick6, nick7 and can safely assume they all belong to the same user.

    conversations = [[] for i in range(10000)]  # This might need to be increased beyond 10000 if there are more users. Same logic as the 7000 above; applies to the other modules too.
    graph_to_sir = []  # NOTE: a data structure without a fixed upper bound would be preferable to these pre-sized lists.
    graph_x_axis = []
    graph_y_axis = []
    graphx1 = []
    graphy1 = []
    graphx2 = []
    graphy2 = []
    dateadd = -1  # day offset used for the response time calculation; varies from 0 to 365

    # Second pass over the logs: record message times for every sender/receiver pair.
    for folderiterator in range(startingMonth, endingMonth + 1):
        temp1 = "0" if folderiterator < 10 else ""
        for fileiterator in range(startingDate if folderiterator == startingMonth else 1, endingDate + 1 if folderiterator == endingMonth else 32):
            temp2 = "0" if fileiterator < 10 else ""
            filePath = log_directory + temp1 + str(folderiterator) + "/" + temp2 + str(fileiterator) + "/" + channel_name + ".txt"
            if not os.path.exists(filePath):
                if not((folderiterator == 2 and (fileiterator == 29 or fileiterator == 30 or fileiterator == 31)) or ((folderiterator == 4 or folderiterator == 6 or folderiterator == 9 or folderiterator == 11) and fileiterator == 31)):
                    print "[Error] Path " + filePath + " doesn't exist"
                continue

            with open(filePath) as f:
                content = f.readlines()  # content stores all the lines of the log file for channel_name

            dateadd = dateadd + 1
            send_time = []  # list of all the times a user sends a message to another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            time_in_min = [[] for i in range(1000)]
            print(filePath)

            # code for making the relation map between clients
            for line in content:
                flag_comma = 0
                if(line[0] != '=' and "] <" in line and "> " in line):
                    m = re.search(r"\<(.*?)\>", line)
                    var = m.group(0)[1:-1]
                    var = ext.util.correctLastCharCR(var)
                    for d in range(len(nicks)):  # e.g. if a user's nicks are rohan1, rohan2, rohan3, ..., nick_sender stores the cluster representative
                        if((d < len(L)) and (var in L[d])):
                            nick_sender = L[d][0]
                            break
                    for i in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        rec_list[1] = rec_list[1][rec_list[1].find(">") + 1:len(rec_list[1])]
                        rec_list[1] = rec_list[1][1:]
                        if not rec_list[1]:
                            break
                        for ik in xrange(0, len(rec_list)):
                            if(rec_list[ik]):
                                rec_list[ik] = ext.util.correctLastCharCR(rec_list[ik])
                        for z in rec_list:
                            if(z == i):
                                send_time.append(line[1:6])
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if((d < len(L)) and (i in L[d])):
                                            nick_receiver = L[d][0]
                                            break
                                    # Append the message time to the existing conversation between userA and userB. If they haven't
                                    # conversed before, start a new entry at a free index and append to it from then on.
                                    for rt in xrange(0, 10000):
                                        if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
                                            conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                            break
                                        if(len(conversations[rt]) == 0):
                                            conversations[rt].append(nick_sender)
                                            conversations[rt].append(nick_receiver)
                                            conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                            break
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[1].split(',')]
                            for ij in xrange(0, len(rec_list_2)):
                                if(rec_list_2[ij]):
                                    rec_list_2[ij] = ext.util.correctLastCharCR(rec_list_2[ij])
                            for j in rec_list_2:
                                if(j == i):
                                    send_time.append(line[1:6])
                                    if(var != i):
                                        for d in range(len(nicks)):
                                            if((d < len(L)) and (i in L[d])):
                                                nick_receiver = L[d][0]
                                                break
                                        for rt in xrange(0, 10000):
                                            if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
                                                conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                                break
                                            if(len(conversations[rt]) == 0):
                                                conversations[rt].append(nick_sender)
                                                conversations[rt].append(nick_receiver)
                                                conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                                break
                        if(flag_comma == 0):
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = rec[1:]
                            rec = ext.util.correctLastCharCR(rec)
                            if(rec == i):
                                send_time.append(line[1:6])
                                if(var != i):
                                    for d in range(len(nicks)):
                                        if ((d < len(L)) and (i in L[d])):
                                            nick_receiver = L[d][0]
                                            break
                                    for rt in xrange(0, 10000):
                                        if (nick_sender in conversations[rt] and nick_receiver in conversations[rt]):
                                            conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                            break
                                        if(len(conversations[rt]) == 0):
                                            conversations[rt].append(nick_sender)
                                            conversations[rt].append(nick_receiver)
                                            conversations[rt].append(24 * 60 * dateadd + int(line[1:6][0:2]) * 60 + int(line[1:6][3:5]))
                                            break

    # The branches above cover all the ways in which messages are addressed (nick1: nick2, nick1,nick2 or nick1,nick2:) and store the
    # message times in conversations. conversations[i] holds all the times exchanged between userA and userB over the analysed period.

    # Remove the first two elements from every conversations[i]: they are the UIDs of sender and receiver respectively (and not RTs),
    # which is why the times start at index 2. After this, conversations holds only the message times.
    for ty in range(0, len(conversations)):
        if(len(conversations[ty]) != 0):
            del conversations[ty][0:2]

    for fg in range(0, len(conversations)):
        if(len(conversations[fg]) != 0):
            first = conversations[fg][0]
            for gh in range(1, len(conversations[fg])):
                if(conversations[fg][gh] - conversations[fg][gh - 1] > 9):
                    # We record the conversation length in conv and the CRT in conv_diff. Here 9 is the average response time
                    # found earlier (see parser-RT.py). This value differs per channel and has to be changed in the code accordingly.
                    conv.append(conversations[fg][gh - 1] - first)
                    conv_diff.append(conversations[fg][gh] - conversations[fg][gh - 1])
                    first = conversations[fg][gh]
                if(gh == (len(conversations[fg]) - 1)):
                    conv.append(conversations[fg][gh] - first)
                    break

    # To plot the CDF we store the CL and CRT values along with their number of occurrences.
    for op in range(0, max(conv)):
        graphx1.append(op)
        graphy1.append(conv.count(op))
    for po in range(0, max(conv_diff)):
        graphx2.append(po)
        graphy2.append(conv_diff.count(po))

    row_cl = zip(graphx1, graphy1)
    filename1 = out_dir_msg_num + channel_name + "_" + str(startingMonth) + "-" + str(startingDate) + "_" + str(endingMonth) + "-" + str(endingDate) + "_CL.csv"
    with open(filename1, 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in row_cl:
            wr.writerow(row)

    row_crt = zip(graphx2, graphy2)
    filename2 = out_dir_msg_num + channel_name + "_" + str(startingMonth) + "-" + str(startingDate) + "_" + str(endingMonth) + "-" + str(endingDate) + "_CRT.csv"
    with open(filename2, 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in row_crt:
            wr.writerow(row)
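    # Illustrative example (hypothetical numbers) of the split performed above, with the RT threshold of 9 minutes:
    #   message times for one pair of users (in minutes): [10, 12, 15, 40, 42]
    #   gaps between consecutive messages:                [2, 3, 25, 2]
    #   the gap of 25 > 9 ends the first conversation, so conv (CL) records 15 - 10 = 5 and later 42 - 40 = 2,
    #   while conv_diff (CRT) records the single refresh time 40 - 15 = 25.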
    # These values are then written to the conv_length (CL) and conv_diff (CRT) csv files.
    # The commented-out code below is for finding the RT (the value 9 used above for finding CRT and CL); refer to parser-RT.py.
    # Note that for finding RT we do not append to conversations as we did above. Instead we append the time in the
    # format taken straight from the log file (e.g. 10:15, the value between [] when the message is sent).
    '''
    for ing in range(0,100):
        if(len(conversations[ing])!=0):  # convert the time from the 10:15 format to the 615-minutes format, which is simpler for subtraction
            for ing1 in range(2,len(conversations[ing])):
                time_in_min[ing].append(int(conversations[ing][ing1][0:2])*60+int(conversations[ing][ing1][3:5]))

    for index in range(0,100):
        if(len(conversations[index])!=0):  # subtract consecutive time values to get the response times for a conversation
            for index1 in range(2,len(conversations[index])-1):
                conversations[index][index1]=(int(conversations[index][index1+1][0:2])*60+int(conversations[index][index1+1][3:5])) - (int(conversations[index][index1][0:2])*60+int(conversations[index][index1][3:5]))

    for index in range(0,100):  # if there are only 3 elements in conversations[i] - uid1, uid2, time - then we convert the time to the minutes format.
        if(len(conversations[index])!=0):
            if(len(conversations[index])==3):
                conversations[index][2] = int(conversations[index][2][0:2])*60+int(conversations[index][2][3:5])
            else:
                del conversations[index][-1]  # otherwise delete the last element of conversations[i], since it is not needed after the subtraction,
                                              # i.e. we remove x(i) as x(i)-x(i-1) has already been recorded at index i-1.
    print(conversations)

    for index in range(0,100):
        if(len(conversations[index])!=0):
            for index1 in range(2,len(conversations[index])):           # append all values after the subtraction operation without the UIDs; that is why the
                totalmeanstd_list.append(conversations[index][index1])  # inner loop starts at 2 (indices 0 and 1 are UIDs). Values go to totalmeanstd_list.

    if(len(totalmeanstd_list)!=0):
        for iy in range(0, max(totalmeanstd_list)+1):
            x_axis.append(iy)
        for ui in x_axis:
            y_axis.append(float(totalmeanstd_list.count(ui))/float(len(totalmeanstd_list)))
        real_y_axis.append(y_axis[0])
        for ix in range(1, len(y_axis)):
            real_y_axis.append(float(real_y_axis[ix-1])+float(y_axis[ix]))
    '''
    '''
    # Plot the response time and its CDF using the pandas library.
    data = {'Response time': x_axis, 'CDF': real_y_axis}
    df = pd.DataFrame(data, columns = ['Response time', 'CDF'])
    df.index = df['Response time']
    del df['Response time']
    axes = plt.gca()
    #axes.set_xlim([0,300])
    axes.set_ylim([0,1.2])
    df.plot(ax=axes)
    name = channel+"_"+str(fileiterator)+"_"+str(iterator)+"_2013_response_time_CDF.pdf"
    #plt.show()
    plt.savefig(name)
    plt.close()
    '''
    '''
    for hi in range(0,len(totalmeanstd_list)):
        graph_to_sir.append(totalmeanstd_list[hi])

    totalmeanstd_list.append(numpy.mean(totalmeanstd_list))  # append the mean and the mean + 2*std of all the RTs for reference
    totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))

    for index in range(0,100):
        if(len(conversations[index])!=0):
            for index1 in range(2,len(conversations[index])):      # similarly, append the mean and mean + 2*std of the RTs of a conversation
                meanstd_list.append(conversations[index][index1])  # between two users, this time to conversations.
            conversations[index].append(numpy.mean(meanstd_list))
            conversations[index].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
            meanstd_list[:] = []

    # Ignore the part below this. It is wrong.
    # ____________________________________________________________________________________________________
    for fina in range(0,100):
        if(len(xarr[fina])!=0):
            calc = time_in_min[fina][0] + xarr[fina][len(xarr[fina])-1]
            for somet in range(0,len(time_in_min[fina])):
                if (time_in_min[fina][somet] > calc):
                    subtr = time_in_min[fina][somet-1] - time_in_min[fina][0]
                    xarr[fina].append(subtr)
                    break
                else:
                    subtr = time_in_min[fina][len(time_in_min[fina])-1] - time_in_min[fina][0]
                    xarr[fina].append(subtr)
                    break

    #print("Conversation RT Info")
    #print(xarr)
    #print("Total Response-Time")
    #print(totalmeanstd_list)
    #print("\n\n")
    #print("graph_to_sir")
    #print(graph_to_sir)

    graph_to_sir.sort()
    #print(graph_to_sir)
    for ti in range(0,graph_to_sir[len(graph_to_sir)-1]+1):
        graph_y_axis.append(graph_to_sir.count(ti))
        graph_x_axis.append(ti)
    #print(graph_y_axis)
    #print(graph_x_axis)
    #print(len(graph_y_axis))
    #print(len(graph_x_axis))

    # Store the RT values and their frequencies in a csv file.
    rows = zip(graph_x_axis,graph_y_axis)
    with open('/home/dhruvie/LOP/graphforsir2.csv', 'a+') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for row in rows:
            wr.writerow(row)
    '''
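

# A minimal usage sketch; the paths and channel name below are hypothetical placeholders.
# log_directory is expected to end with the year directory and a trailing slash, since the
# <month>/<day>/ sub-directories and "<channel_name>.txt" are appended to it directly.
if __name__ == "__main__":
    findConvLength_ConvRefreshTime(
        log_directory="/path/to/logs/2013/",    # hypothetical <year>/ directory containing <month>/<day>/<channel>.txt files
        channel_name="channel",                 # hypothetical channel; the per-day log file is expected to be channel.txt
        output_directory="/path/to/output/",    # the CL and CRT csv files are appended here
        startingDate=1,
        startingMonth=1,
        endingDate=31,
        endingMonth=12)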