Source code for lib.AggregateGraph
import errno
import os
import os.path
import re
import sys
from collections import OrderedDict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pygraphviz as pygraphviz
import pylab
from networkx.algorithms.components.connected import connected_components

import ext.util
# First "<...>" group on a line; for chat messages this is the sender's nick.
_NICK_IN_BRACKETS = re.compile(r"\<(.*?)\>")


def _two_digit(number):
    """Return ``number`` as text, prefixed with "0" when it is below 10."""
    return ("0" if number < 10 else "") + str(number)


def _ensure_parent_directory(path_prefix):
    """Create the directory part of ``path_prefix`` if it does not exist yet."""
    directory = os.path.dirname(path_prefix)
    if not directory or os.path.exists(directory):
        # An empty dirname means "current directory": nothing to create.
        return
    try:
        os.makedirs(directory)
    except OSError as exc:
        # Another process may have created it between the check and the call.
        if exc.errno != errno.EEXIST:
            raise


def _iter_channel_logs(log_directory, channel_name, startingDate,
                       startingMonth, endingDate, endingMonth):
    """Yield the path of every existing daily log in the range, in date order.

    A missing file is reported on stdout unless the day cannot exist
    (29-31 February, or the 31st of April, June, September and November).
    """
    for month in range(startingMonth, endingMonth + 1):
        first_day = startingDate if month == startingMonth else 1
        stop_day = endingDate + 1 if month == endingMonth else 32
        for day in range(first_day, stop_day):
            file_path = (log_directory + _two_digit(month) + "/"
                         + _two_digit(day) + "/" + channel_name + ".txt")
            if os.path.exists(file_path):
                yield file_path
                continue
            impossible_day = ((month == 2 and day in (29, 30, 31))
                              or (month in (4, 6, 9, 11) and day == 31))
            if not impossible_day:
                print("[Error] Path " + file_path + " doesn't exist")


def _is_message(line):
    """Tell whether ``line`` looks like a chat message ("...] <nick> text")."""
    return line[0] != '=' and "] <" in line and "> " in line


def _is_nick_change(line):
    """Tell whether ``line`` is a "=" status line other than a topic change."""
    return line[0] == '=' and "changed the topic of" not in line


def _message_sender(line):
    """Return the normalised nick found between the first "<" and ">", or None."""
    match = _NICK_IN_BRACKETS.search(line)
    if match is None:
        return None
    return ext.util.correctLastCharCR(match.group(1))


def _parse_nick_change(line):
    """Return the ``(old, new)`` nicks sliced out of a nick-change status line.

    The old nick is the text starting four characters after the first "=" and
    ending before " is"; the new nick is the text after "wn as " up to the
    newline. The slicing is kept exactly as the original code had it.
    """
    old_nick = line[line.find("=") + 1:line.find(" is")][3:]
    new_nick = line[line.find("wn as") + 1:line.find("\n")][5:]
    return (ext.util.correctLastCharCR(old_nick),
            ext.util.correctLastCharCR(new_nick))


def _merge_aliases(nick_same_list, old_nick, new_nick):
    """Record that ``old_nick`` and ``new_nick`` belong to the same user.

    The first slot that already holds either nick receives the missing one;
    otherwise both go into the first empty slot. When every slot is taken the
    pair is dropped, as in the original fixed-size table.
    """
    for group in nick_same_list:
        if old_nick in group or new_nick in group:
            if old_nick not in group:
                group.append(old_nick)
            elif new_nick not in group:
                group.append(new_nick)
            return
        if not group:
            group.append(old_nick)
            group.append(new_nick)
            return


def _addressed_nicks(line, sender, nicks):
    """Return ``(nick, hits)`` pairs for each known nick that ``line`` addresses.

    ``hits`` is how many times the nick matched: once per ":"-separated piece
    equal to it, plus once per ","-separated recipient equal to it, or (when
    the recipient text has no comma) once if the text between the sender and
    the first ", " equals it. The sender's own nick is never reported.
    """
    pieces = [piece.strip() for piece in line.split(':')]
    if len(pieces) < 2:
        return []
    # Keep only what follows the sender's ">" and the single character after it.
    pieces[1] = pieces[1][pieces[1].find(">") + 1:][1:]
    if not pieces[1]:
        return []
    pieces = [ext.util.correctLastCharCR(piece) if piece else piece
              for piece in pieces]

    listed = []
    single = None
    if "," in pieces[1]:
        # "nick1, nick2: text" style: every comma-separated name is a recipient.
        for name in pieces[1].split(','):
            name = name.strip()
            listed.append(ext.util.correctLastCharCR(name) if name else name)
    else:
        single = ext.util.correctLastCharCR(
            line[line.find(">") + 1:line.find(", ")][1:])

    addressed = []
    for nick in nicks:
        if nick == sender:
            continue
        hits = pieces.count(nick) + listed.count(nick)
        if single is not None and single == nick:
            hits += 1
        if hits:
            addressed.append((nick, hits))
    return addressed


def createAggregateGraph(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Create a directed message graph of a channel over a longer time frame.

    Each node stands for one IRC user (all nicks linked by nick-change lines
    are merged into a single numbered node) and each directed edge carries, as
    ``weight`` and ``label``, the number of messages the source user addressed
    to the target user in the selected time frame. The graph is laid out with
    Graphviz ``dot`` and written to
    ``<output_directory><channel_name>_2013_<startingMonth>_<endingMonth>_aggregategraph.png``
    (the ``2013`` in the file name is fixed).

    Args:
        log_directory (str): Location of the logs. ``<month>/<day>/<channel_name>.txt``
            is appended directly, so it must end with a path separator.
        channel_name (str): Channel to perform the analysis on.
        output_directory (str): Prefix of the output file; its directory part is
            created when missing.
        startingDate (int): Day to start the analysis (in conjunction with startingMonth).
        startingMonth (int): Month to start the analysis (in conjunction with startingDate).
        endingDate (int): Day to end the analysis (in conjunction with endingMonth).
        endingMonth (int): Month to end the analysis (in conjunction with endingDate).

    Returns:
        None
    """
    MAX_EXPECTED_DIFF_NICKS = 5000
    # Fixed-size table of alias groups; ext.util.to_graph receives it unchanged,
    # empty slots included, exactly as before.
    nick_same_list = [[] for _ in range(MAX_EXPECTED_DIFF_NICKS)]
    nicks = []          # every nickname seen, in first-seen order
    known_nicks = set()  # same content as ``nicks`` for O(1) membership tests
    date_range = (startingDate, startingMonth, endingDate, endingMonth)

    _ensure_parent_directory(output_directory)

    # Pass 1: collect nicknames and the alias groups announced by nick changes.
    for file_path in _iter_channel_logs(log_directory, channel_name, *date_range):
        with open(file_path) as log_file:
            content = log_file.readlines()
        print("Working on " + file_path)

        for line in content:
            if not _is_message(line):
                continue
            # Normalise before the membership test: comparing the raw nick
            # against the already-normalised list re-appended, on every later
            # day, any nick that correctLastCharCR rewrites.
            sender = _message_sender(line)
            if sender is not None and sender not in known_nicks:
                known_nicks.add(sender)
                nicks.append(sender)

        for line in content:
            if not _is_nick_change(line):
                continue
            old_nick, new_nick = _parse_nick_change(line)
            for nick in (old_nick, new_nick):
                if nick not in known_nicks:
                    known_nicks.add(nick)
                    nicks.append(nick)
            _merge_aliases(nick_same_list, old_nick, new_nick)

    # Nicks that never took part in a nick change get a group of their own.
    for nick in nicks:
        for group in nick_same_list:
            if nick in group:
                break
            if not group:
                group.append(nick)
                break

    # Groups sharing a nick are one user: merge them via connected components.
    # list(...) copes with networkx returning lists, a generator, or sets.
    alias_graph = ext.util.to_graph(nick_same_list)
    components = [list(members) for members in connected_components(alias_graph)]
    # Users are numbered from "1"; ids are kept out of the member lists so a
    # purely numeric nick cannot be mistaken for a group id.
    nick_to_id = {}
    for number, members in enumerate(components, 1):
        for member in members:
            nick_to_id.setdefault(member, str(number))

    # Pass 2: count directed messages between users. Keys are
    # (sender_id, receiver_id); insertion order is kept for edge creation.
    conversations = OrderedDict()
    for file_path in _iter_channel_logs(log_directory, channel_name, *date_range):
        with open(file_path) as log_file:
            content = log_file.readlines()
        print(file_path)

        for line in content:
            if not _is_message(line):
                continue
            sender = _message_sender(line)
            sender_id = nick_to_id.get(sender)
            if sender_id is None:
                # Unknown sender: skip rather than reuse the previous line's id.
                continue
            for receiver, hits in _addressed_nicks(line, sender, nicks):
                receiver_id = nick_to_id.get(receiver)
                if receiver_id is None:
                    continue
                pair = (sender_id, receiver_id)
                conversations[pair] = conversations.get(pair, 0) + hits

    aggregate_graph = nx.DiGraph()
    for (sender_id, receiver_id), count in conversations.items():
        # ``label`` mirrors ``weight`` so Graphviz prints the count on the edge.
        aggregate_graph.add_edge(sender_id, receiver_id, weight=count, label=count)

    output_file = output_directory + channel_name + "_2013_" + str(startingMonth) + "_" + str(endingMonth) + "_aggregategraph.png"
    print("Generating " + output_file)
    print("Please wait ....")

    to_agraph = getattr(nx, "to_agraph", None)
    if to_agraph is None:
        # networkx 2.0 removed the top-level alias; the function itself lives on.
        from networkx.drawing.nx_agraph import to_agraph
    A = to_agraph(aggregate_graph)
    A.layout(prog='dot')
    A.draw(output_file)
    print("Done Generating")