# Source code for lib.degreeNodeNumberCSV

import os.path
import re
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pylab
import pygraphviz as pygraphviz
import os
import csv
import math
import numpy as np
from numpy.random import normal
from scipy.optimize import curve_fit
from scipy import stats
from sklearn.metrics import mean_squared_error
import ext.util

# Width of every per-day histogram: degrees 0 .. _MAX_DEGREE_TRACKED - 1.
_MAX_DEGREE_TRACKED = 1000
# Number of distinct (sender, receiver) pairs remembered per day.
# NOTE(review): pairs seen after the cap is reached are dropped, exactly as
# the fixed 100-slot table of the earlier implementation did.
_MAX_CONVERSATION_PAIRS = 100


def _ensure_directory(path):
    """Create ``path`` (including parents) unless it already exists."""
    if not path:
        # An empty string stands for the current directory: nothing to do.
        return
    try:
        os.makedirs(path)
    except OSError:
        # The directory may already be there (e.g. created by a concurrent
        # run); every other failure is re-raised.
        if not os.path.isdir(path):
            raise


def _open_csv_for_writing(path):
    """Open ``path`` in the mode the ``csv`` writer of this interpreter needs."""
    import sys
    if sys.version_info[0] < 3:
        return open(path, 'wb')
    return open(path, 'w', newline='')


def _is_impossible_date(month, day):
    """Tell whether month/day can never exist (Feb 29 is counted as such)."""
    return ((month == 2 and day in (29, 30, 31))
            or (month in (4, 6, 9, 11) and day == 31))


def _sender_tag(line):
    """Return the ``<nick>`` tag of a chat-message line, else ``None``."""
    if line.startswith('=') or "] <" not in line or "> " not in line:
        return None
    match = re.search(r"\<(.*?)\>", line)
    return match.group(0) if match else None


def _nick_change(line):
    """Return ``(old_nick, new_nick)`` for a nick-change notice, else ``None``.

    Every line starting with '=' that is not a topic change is treated as a
    notice; presumably it reads ``=-= old is now known as new`` -- the slicing
    below relies on the " is" and "wn as" markers.
    """
    if not line.startswith('=') or "changed the topic of" in line:
        return None
    old_nick = line[line.find("=") + 1:line.find(" is")][3:]
    new_nick = line[line.find("wn as") + 1:line.find("\n")][5:]
    return (ext.util.correctLastCharCR(old_nick),
            ext.util.correctLastCharCR(new_nick))


def _collect_nicks(content):
    """List every nick that talks or takes part in a nick change."""
    tags = []
    for line in content:
        tag = _sender_tag(line)
        # De-duplication happens on the raw "<nick>" tag, before clean-up.
        if tag is not None and tag not in tags:
            tags.append(tag)
    nicks = [ext.util.correctLastCharCR(tag[1:-1]) for tag in tags]

    for line in content:
        change = _nick_change(line)
        if change is None:
            continue
        for nick in change:
            if nick not in nicks:
                nicks.append(nick)
    return nicks


def _group_aliases(content, nicks):
    """Group nicks that belong to one person (linked by nick-change notices).

    Returns one list per entry of ``nicks``; unused groups stay empty.
    """
    nick_same_list = [[] for _ in nicks]
    for line in content:
        change = _nick_change(line)
        if change is None:
            continue
        old_nick, new_nick = change
        for group in nick_same_list:
            # Join the first group that already knows either nick, or claim
            # the first empty group.
            if old_nick in group or new_nick in group or not group:
                group.append(old_nick)
                group.append(new_nick)
                break
    return nick_same_list


def _canonical_nick(nick, nick_same_list):
    """Return the first alias of the group holding ``nick`` (or ``nick``)."""
    for group in nick_same_list:
        if nick in group:
            return group[0]
    return nick


def _addressed_tokens(line):
    """Return the strings of a message line that may name a receiver.

    ``None`` means the line carries nothing to inspect.  The same token can
    appear more than once; each occurrence counts as one message.
    """
    parts = [part.strip() for part in line.split(':')]
    if len(parts) < 2:
        return None
    # parts[1] runs from the timestamp's colon to the next colon; keep what
    # follows "<nick> ".
    parts[1] = parts[1][parts[1].find(">") + 1:][1:]
    if not parts[1]:
        return None
    parts = [ext.util.correctLastCharCR(part) if part else part
             for part in parts]

    tokens = list(parts)
    if "," in parts[1]:
        for piece in parts[1].split(','):
            piece = piece.strip()
            tokens.append(ext.util.correctLastCharCR(piece) if piece else piece)
    else:
        # No comma: also try the text after "<nick> " up to the first ", "
        # (or up to the last character when there is none).
        whole = line[line.find(">") + 1:line.find(", ")][1:]
        tokens.append(ext.util.correctLastCharCR(whole))
    return tokens


def _count_message(conversations, sender, receiver):
    """Add one message to the ``[count, sender, receiver]`` table."""
    for entry in conversations:
        if entry[1] == sender and entry[2] == receiver:
            entry[0] += 1
            return
    if len(conversations) < _MAX_CONVERSATION_PAIRS:
        conversations.append([1, sender, receiver])


def _daily_conversations(content, nicks, nick_same_list):
    """Build the ``[count, sender, receiver]`` table for one day's log."""
    conversations = []
    for line in content:
        tag = _sender_tag(line)
        if tag is None:
            continue
        sender_raw = ext.util.correctLastCharCR(tag[1:-1])
        tokens = _addressed_tokens(line)
        if tokens is None:
            continue
        sender = _canonical_nick(sender_raw, nick_same_list)
        for nick in nicks:
            if nick == sender_raw:
                # Messages addressed to oneself are ignored.
                continue
            receiver = None
            for token in tokens:
                if token == nick:
                    if receiver is None:
                        receiver = _canonical_nick(nick, nick_same_list)
                    _count_message(conversations, sender, receiver)
    return conversations


def _degree_histogram(degrees):
    """Count nodes per degree; index ``d`` holds the nodes of degree ``d``."""
    histogram = [0] * _MAX_DEGREE_TRACKED
    # dict() accepts both the plain dict of networkx 1.x and the
    # (node, degree) view of networkx 2.x.
    for degree in dict(degrees).values():
        histogram[degree] += 1
    return histogram


def _save_daily_plot(total_histogram, image_path):
    """Save the log-log plot of one day's total-degree histogram."""
    x_axis_log = [math.log(i) for i in range(1, 20)]  # ignore degree 0
    y_axis_log = [math.log(count) if count > 0 else 0
                  for count in total_histogram[1:20]]  # ignore degree 0
    plt.plot(x_axis_log, y_axis_log)
    plt.plot([1, 2], [1, 2])
    plt.xlabel("log(degree)")
    plt.ylabel("log(no_of_nodes)")
    # One label per tick position; a shorter label list is rejected by
    # recent matplotlib releases.
    tick_labels = ['log' + str(i) for i in range(1, len(x_axis_log) + 1)]
    plt.xticks(x_axis_log, tick_labels)
    plt.yticks(x_axis_log, tick_labels)
    plt.legend(['Required', 'y = x'], loc='upper left')
    plt.savefig(image_path)
    plt.close()


def _write_degree_csv(path, title, rows_per_day):
    """Write the per-day histograms transposed (one CSV row per degree).

    Returns the written rows as a list of tuples: the day labels, the
    per-day totals, then one row per degree starting at degree 0.
    """
    header = [title, 'total']
    header.extend('deg' + str(i) for i in range(_MAX_DEGREE_TRACKED))
    columns = list(zip(*([header] + rows_per_day)))
    with _open_csv_for_writing(path) as csv_file:
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        for column in columns:
            writer.writerow(column)
    return columns


def degreeNodeNumberCSV(log_directory, channel_name, output_directory, startingDate, startingMonth, endingDate, endingMonth):
    """Create CSV files with the number of nodes per in-, out- and total degree.

    For every day in the requested range the channel log is turned into a
    directed "who addressed whom" graph.  The number of nodes per degree is
    written, one column per day, to three CSV files (out, in and total
    degree).  A log(degree) vs log(no. of nodes) image is saved per day, and
    generateFitGraphOverTime is called for the OUT and IN data to fit a
    line over the whole period.

    Args:
        log_directory (str): Location of the logs (Assumed to be arranged in
            directory structure as :
            <year>/<month>/<day>/<log-file-for-channel>.txt). The month
            folder name is appended directly to this string.
        channel_name (str): Channel to perform the analysis on
        output_directory (str): Location of output directory. File names are
            appended directly, so it has to end with a path separator.
        startingDate (int): Date to start the analysis (in conjunction with startingMonth)
        startingMonth (int): Month to start the analysis (in conjunction with startingDate)
        endingDate (int): Date to end the analysis (in conjunction with endingMonth)
        endingMonth (int): Month to end the analysis (in conjunction with endingDate)

    Returns:
        None
    """
    output_dir_degree = output_directory
    output_dir_degree_img = output_dir_degree + "individual-images/"
    period = str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)
    output_file_out_degree = output_dir_degree + channel_name+"_out_degree"+period+".csv"
    output_file_in_degree = output_dir_degree + channel_name+"_in_degree"+period+".csv"
    output_file_total_degree = output_dir_degree + channel_name+"_total_degree"+period+".csv"

    # Both directories are needed on every run; the CSV files themselves are
    # created when they are written at the end.
    _ensure_directory(os.path.dirname(output_dir_degree))
    _ensure_directory(os.path.dirname(output_dir_degree_img))

    nodes_with_OUT_degree_per_day = []
    nodes_with_IN_degree_per_day = []
    nodes_with_TOTAL_degree_per_day = []

    for folderiterator in range(startingMonth, endingMonth + 1):
        month_dir = ("0" if folderiterator < 10 else "") + str(folderiterator)
        first_day = startingDate if folderiterator == startingMonth else 1
        last_day = endingDate if folderiterator == endingMonth else 31

        for fileiterator in range(first_day, last_day + 1):
            day_dir = ("0" if fileiterator < 10 else "") + str(fileiterator)
            filePath = log_directory+month_dir+"/"+day_dir+"/"+channel_name+".txt"
            if not os.path.exists(filePath):
                # Days that cannot exist are skipped silently.
                if not _is_impossible_date(folderiterator, fileiterator):
                    print("[Error] Path "+filePath+" doesn't exist")
                continue

            with open(filePath) as f:
                content = f.readlines()

            nicks = _collect_nicks(content)
            nick_same_list = _group_aliases(content, nicks)
            conversations = _daily_conversations(content, nicks, nick_same_list)

            # One directed edge per (sender, receiver) pair, weighted by the
            # number of messages.
            msg_num_graph = nx.DiGraph()
            for count, sender, receiver in conversations:
                msg_num_graph.add_edge(sender, receiver, weight=count, label=count)

            nodes_with_OUT_degree = _degree_histogram(msg_num_graph.out_degree())
            nodes_with_IN_degree = _degree_histogram(msg_num_graph.in_degree())
            nodes_with_TOTAL_degree = _degree_histogram(msg_num_graph.degree())

            day_label = str(folderiterator)+"-"+str(fileiterator)
            _save_daily_plot(
                nodes_with_TOTAL_degree,
                output_dir_degree_img+"/total_out_degree"+day_label+".png")

            # Row layout per day: [day label, node total, deg0, deg1, ...]
            for histogram, per_day in (
                    (nodes_with_OUT_degree, nodes_with_OUT_degree_per_day),
                    (nodes_with_IN_degree, nodes_with_IN_degree_per_day),
                    (nodes_with_TOTAL_degree, nodes_with_TOTAL_degree_per_day)):
                per_day.append([day_label, sum(histogram)] + histogram)

    column_wise_OUT = _write_degree_csv(
        output_file_out_degree, 'out-degree/day>', nodes_with_OUT_degree_per_day)
    column_wise_IN = _write_degree_csv(
        output_file_in_degree, 'in-degree/day>', nodes_with_IN_degree_per_day)
    _write_degree_csv(
        output_file_total_degree, 'degree/day>', nodes_with_TOTAL_degree_per_day)

    generateFitGraphOverTime("OUT", 9, column_wise_OUT, channel_name, startingDate, startingMonth, endingDate, endingMonth, output_dir_degree)
    generateFitGraphOverTime("IN", 9, column_wise_IN, channel_name, startingDate, startingMonth, endingDate, endingMonth, output_dir_degree)
# ------------------------- helper function to gen graph -------------------------
def generateFitGraphOverTime(typeOfDegree, filter_val, column_wise, channel_name, startingDate, startingMonth, endingDate, endingMonth, output_dir_degree):
    """Fit a line to log(no. of nodes) vs log(degree) and save the plot.

    The node counts of all days are summed per degree, degrees
    1 .. filter_val - 1 are kept, and a least-squares line is fitted to the
    log-log values.  Degree type, slope, intercept, r squared and mean
    squared error are printed tab-separated, and the data plus the fitted
    line are saved as a PNG in output_dir_degree.

    Args:
        typeOfDegree (str): Label used in the printed line and the file name
        filter_val (int): Degrees below this value (and above 0) are fitted
        column_wise (iterable): Rows as written to the degree CSV files:
            row 0 holds the day labels, row 1 the per-day totals, row 2
            degree 0, row 3 degree 1 and so on; the first cell of every row
            is a text label.
        channel_name (str): Channel name, used in the file name
        startingDate (int): Used in the file name only
        startingMonth (int): Used in the file name only
        endingDate (int): Used in the file name only
        endingMonth (int): Used in the file name only
        output_dir_degree (str): Directory the image is saved to

    Returns:
        None
    """
    rows = list(column_wise)  # also accepts a lazy zip(...) result

    # Skip the two header rows so that the list index equals the degree.
    nodes_per_degree = [sum(row[1:]) for row in rows[2:]]

    # Degree 0 is ignored: log(0) is undefined.  Each degree d is paired with
    # the node count of that same degree.
    x_axis_log = [math.log(degree) for degree in range(1, filter_val)]
    y_axis_log = [math.log(count) if count > 0 else 0
                  for count in nodes_per_degree[1:filter_val]]

    x = np.array(x_axis_log)
    y = np.array(y_axis_log)

    slope, intercept, r_value, p_value, std_err = stats.linregress(x_axis_log, y_axis_log)
    fitted = slope * x + intercept

    print(str(typeOfDegree)+"\t"+str(slope)+"\t"+str(intercept)+"\t"+str(r_value**2)+"\t"+str(mean_squared_error(y, fitted)))

    # graph config
    axes = plt.gca()
    axes.set_xlim([0, 3])
    axes.set_ylim([0, 6])
    plt.xlabel("log(degree)")
    plt.ylabel("log(no_of_nodes)")

    # The plotted fit is the same line whose parameters were printed above.
    plt.plot(x, y, '-')
    plt.plot(x, fitted, '-')
    plt.legend(['Data', 'Fit'], loc='upper right')
    plt.savefig(output_dir_degree+"/"+channel_name+"_"+typeOfDegree+"_graph_"+str(startingMonth)+"-"+str(startingDate)+"_"+str(endingMonth)+"-"+str(endingDate)+".png")
    plt.close()