How to add custom hues using seaborn for multiple pointplots? - seaborn

test_results = pd.DataFrame({'Month Number': {0: 11, 1: 2},
                             'LSOA code': {0: 60, 1: 67},
                             'Actual Frequency': {0: 13, 1: 1},
                             'Linear Regression': {0: 3.326444, 1: 3.742185},
                             'Ridge Regression': {0: 3.326444, 1: 3.742185}})
fig, ax = plt.subplots(figsize=(15, 8))
sns.pointplot(x='Month Number', y='Actual Frequency', ci=False, color='Red', data=test_results)
sns.pointplot(x='Month Number', y='Linear Regression', ci=False, color='Black', data=test_results)
sns.pointplot(x='Month Number', y='Ridge Regression', ci=False, color='Green', data=test_results)
I want to display point plots for the actual frequency, linear regression and ridge regression in different colors, and add a hue to distinguish them.

You can melt your dataframe. Then for your pointplot, you only need to specify hue rather than creating three separate pointplots:
In[1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
test_results = pd.DataFrame({'Month Number': {0: 11, 1: 2},
                             'LSOA code': {0: 60, 1: 67},
                             'Actual Frequency': {0: 13, 1: 1},
                             'Linear Regression': {0: 3.326444, 1: 3.742185},
                             'Ridge Regression': {0: 3.326444, 1: 3.742185}})
test_results
Out[1]:
Month Number LSOA code Actual Frequency Linear Regression Ridge Regression
0 11 60 13 3.326444 3.326444
1 2 67 1 3.742185 3.742185
In[2]:
fig, ax = plt.subplots(figsize=(15, 8))
test_results = test_results.melt(id_vars='Month Number', value_vars=test_results.columns[-3:])
test_results
Out[2]:
Month Number variable value
0 11 Actual Frequency 13.000000
1 2 Actual Frequency 1.000000
2 11 Linear Regression 3.326444
3 2 Linear Regression 3.742185
4 11 Ridge Regression 3.326444
5 2 Ridge Regression 3.742185
In[3]:
sns.pointplot(x='Month Number', y='value', hue='variable', ci=False, data=test_results)
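Note: on seaborn 0.12 and later the ci parameter is deprecated; if you are on a newer version, errorbar=None should be the equivalent way to suppress the confidence-interval bars:
sns.pointplot(x='Month Number', y='value', hue='variable', errorbar=None, data=test_results)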

Related

How can I plot different types of seaborn plots on different x ticks?

I want to have multiple types of seaborn plots using the same y axis but with different x coordinates (see image below).
I've tried specifying the x-axis coordinates in several different ways but can't seem to get it to work.
Here is an example of almost working code
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

x=[1,2,3,3,3,4,4,5,5,6] # first violin
y=[4,4,5,5,5,5,6] # second violin
z=[5,5,6] # swarmplot over second violin
c2v={'value':[], 'key':[]}
for data,label in [(x,'x'),(y,'y'),(z,'z')]:
    for i in data:
        c2v['value'].append(i)
        c2v['key'].append(label)
data=pd.DataFrame(c2v)
data.head()
print(data.loc[data.key=='z'])
fig,ax=plt.subplots(1,figsize=(5,5),dpi=200)
ax = sns.violinplot(data=data.loc[data.key.isin(['x','y'])], x='key', y='value', palette=['honeydew','lightgreen'])
sns.swarmplot(x=['swarmplot']*len(data), y=data['value'], order=ax.get_xticklabels() + ['swarmplot'], ax=ax) #.loc[data.key=='z',:]
ax.set_xlabel('')
It produces the following image:
However, it is plotting all values associated with x/y/z instead of just z. When I slice the dataframe to only 'z' in the swarmplot as below, I get an error:
sns.swarmplot(x=['swarmplot']*len(data), y=data.loc[data.key=='z',:]['value'], order=ax.get_xticklabels() + ['swarmplot'], ax=ax)
KeyError: 'swarmplot'
Any suggestions?
To draw a second plot onto the same x-axis, you can use order=, passing the list of existing tick labels with the new labels appended.
Here is an example:
import seaborn as sns
tips = sns.load_dataset('tips')
ax = sns.swarmplot(data=tips, x='day', y='total_bill')
sns.violinplot(x=['violin']*len(tips), y=tips['total_bill'], order=ax.get_xticklabels() + ['violin'], ax=ax)
ax.set_xlabel('')
The problem with the code in the new question is that the x= and y= of the swarmplot need the same number of elements. The swarmplot also seems to reset the y limits, so I added some code to readjust those:
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
x = [1, 2, 3, 3, 3, 4, 4, 5, 5, 6] # first violin
y = [4, 4, 5, 5, 5, 5, 6] # second violin
z = [5, 5, 6] # swarmplot over second violin
data = pd.DataFrame({'value': np.concatenate([x, y, z]),
                     'key': ['x'] * len(x) + ['y'] * len(y) + ['z'] * len(z)})
fig, ax = plt.subplots(1, figsize=(5, 5))
ax = sns.violinplot(data=data.loc[data.key.isin(['x', 'y'])], x='key', y='value', palette=['honeydew', 'lightgreen'])
ymin1, ymax1 = ax.get_ylim()
swarm_data = data.loc[data.key == 'z', :]['value']
sns.swarmplot(x=['swarmplot'] * len(swarm_data), y=swarm_data, order=ax.get_xticklabels() + ['swarmplot'], ax=ax)
ymin2, ymax2 = ax.get_ylim()
ax.set_ylim(min(ymin1, ymin2), max(ymax1, ymax2))
ax.set_xlabel('')
ax.set_xticks(np.arange(3))
ax.set_xticklabels(['x', 'y', 'swarmplot'])
plt.show()
You can simplify things by directly using the data without creating a dataframe:
x = [1, 2, 3, 3, 3, 4, 4, 5, 5, 6] # first violin
y = [4, 4, 5, 5, 5, 5, 6] # second violin
z = [5, 5, 6] # swarmplot over second violin
fig, ax = plt.subplots(1, figsize=(5, 5))
ax = sns.violinplot(x=['x']*len(x) + ['y']*len(y), y=x + y, palette=['honeydew', 'lightgreen'])
ymin1, ymax1 = ax.get_ylim()
sns.swarmplot(x=['swarmplot'] * len(z), y=z, order=ax.get_xticklabels() + ['swarmplot'], ax=ax)
ymin2, ymax2 = ax.get_ylim()
ax.set_ylim(min(ymin1, ymin2), max(ymax1, ymax2))
ax.set_xticks(np.arange(3))
ax.set_xticklabels(['x', 'y', 'swarmplot'])
plt.show()

Efficient algorithm for planar 4-colour graph colouring with scoring

I am making a game where you have to colour an image according to the four-colour theorem: no two adjacent regions may share a colour. There are four colours, red, green, blue and yellow, and you get 10 points for each red region, 6 for green, 3 for blue and 1 for yellow.
I want an algorithm that can work out the maximum score for any given image. I have the code for extracting a planar graph from the image, which gives, for each region, a list of its neighbours.
So far I have a brute-force implementation that checks all possible colourings; however, that grows like 4**n for n regions. One approach I can take is to optimise this search as much as possible.
Is there any faster way? I know there is a linear-time algorithm for 2 colours, but for game-design reasons I will generally not generate images that can be coloured with 2 colours.
Thanks :)
Edit: as sascha requested, here are some example Python dicts; the keys are the region IDs and the values are the lists of neighbours of that region:
easy = {2: [4], 4: [2, 3, 14, 13], 3: [4], 14: [4], 13: [4]}
top score : 46 (I think)
(my python bruteforce 0.1s)
medium = {2: [4, 5, 6], 4: [2, 3], 3: [4, 18], 5: [2, 6], 6: [5, 2, 13, 18], 13: [6, 20, 21, 22], 18: [6, 3, 20, 22], 20: [18, 13], 22: [18, 13], 21: [13]}
top score : 77
(my python bruteforce 7.2s)
hard = {2: [5, 6, 9], 5: [2, 4], 4: [5, 23], 6: [2, 7, 10], 3: [8, 16], 8: [3, 7, 12], 7: [6, 8, 10, 11], 9: [2, 10], 10: [6, 9, 7, 13, 14, 15, 17, 18], 11: [7, 12, 13], 12: [8, 11, 15, 16, 19], 13: [10, 11, 15], 14: [10, 15], 15: [10, 13, 12, 14, 17, 19], 16: [3, 12, 25, 27], 17: [10, 15, 18], 18: [10, 17, 19, 20], 19: [15, 18, 12, 27], 20: [18, 22, 24, 26, 27, 25], 22: [20], 23: [4, 24, 26], 24: [23, 20], 25: [16, 20], 26: [23, 20], 27: [19, 20, 16]}
(my python bruteforce unknown)
Edit2:
So I finished the game, if you are interested you can check it out here.
For the sake of the game I realized I only needed a high score rather than the absolute top score (which is what the question asked for). So I implemented greedy colouring and ran it 10,000 times, shuffling the graph each time and taking the best-scoring result; a sketch of the idea is below. On all small boards of fewer than 30 regions this produces the same result as the brute-force methods, and its time complexity scales much better to larger boards. So while it may not find the absolute best solution, it will always find a very good one.
Thanks so much to @SaiBot and @sascha for their help :)
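A minimal sketch of that greedy-restart idea (reconstructed here for illustration, not the game's actual code; greedy_score and best_greedy_score are made-up names):
import random

SCORING = {'red': 10, 'green': 6, 'blue': 3, 'yellow': 1}

def greedy_score(graph, order):
    # colour nodes in the given order, always taking the highest-scoring
    # colour not already used by a coloured neighbour
    coloring = {}
    total = 0
    for node in order:
        taken = {coloring[n] for n in graph[node] if n in coloring}
        for colour in ('red', 'green', 'blue', 'yellow'):  # best score first
            if colour not in taken:
                coloring[node] = colour
                total += SCORING[colour]
                break
        else:
            return None  # stuck: neighbours already use all four colours
    return total

def best_greedy_score(graph, restarts=10000):
    # shuffle the node order on each restart and keep the best-scoring run
    nodes = list(graph)
    best = 0
    for _ in range(restarts):
        random.shuffle(nodes)
        score = greedy_score(graph, nodes)
        if score is not None:
            best = max(best, score)
    return best
Per the note above, on boards of this size the restarted greedy reliably reaches the same scores as the brute force (46, 77 and 194 on the instances here, per the answers below).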
Here is a simplified Integer Programming approach using Python.
The basic idea is to use the amazing capabilities of modern high-quality Mixed Integer Programming software without implementing the algorithms ourselves. We just need to define the model (and maybe tune some things)!
Keep in mind that (Mixed-)Integer Programming is NP-hard in general, and we assume the solver's heuristics work for our problem here!
The code may look somewhat ugly as the modelling tool used is quite low-level. The model itself is quite simple in its structure.
Code (python3; numpy, scipy, cylp + CoinOR Cbc solver)
Here is the prototype-like code, which is missing the extraction of the final solution. As this is just a demo (you did not tag a language), it simply shows that this can be a viable approach.
from cylp.cy import CyClpSimplex
import itertools
import numpy as np
import scipy.sparse as sp
from timeit import default_timer as time
""" Instances """
# hard = {2: [4], 4: [2, 3, 14, 13], 3: [4], 14: [4], 13: [4]}
# hard = {2: [4, 5, 6], 4: [2, 3], 3: [4, 18], 5: [2, 6], 6: [5, 2, 13, 18], 13: [6, 20, 21, 22], 18: [6, 3, 20, 22], 20: [18, 13], 22: [18, 13], 21: [13]}
hard = {2: [5, 6, 9],
        5: [2, 4],
        4: [5, 23],
        6: [2, 7, 10],
        3: [8, 16],
        8: [3, 7, 12],
        7: [6, 8, 10, 11],
        9: [2, 10],
        10: [6, 9, 7, 13, 14, 15, 17, 18],
        11: [7, 12, 13],
        12: [8, 11, 15, 16, 19],
        13: [10, 11, 15],
        14: [10, 15],
        15: [10, 13, 12, 14, 17, 19],
        16: [3, 12, 25, 27],
        17: [10, 15, 18],
        18: [10, 17, 19, 20],
        19: [15, 18, 12, 27],
        20: [18, 22, 24, 26, 27, 25],
        22: [20],
        23: [4, 24, 26],
        24: [23, 20],
        25: [16, 20],
        26: [23, 20],
        27: [19, 20, 16]}
""" Preprocessing -> neighbor conflicts
(remove dupes after sorting <-> symmetry
Remark: for difficult use-cases one could try to think about special
characteristics of the graph, like (not necessarily for this problem)
chordal -> calc all max-cliques in P(olynomial-time) => pretty good convex-hull
Here: just forbid conflicting-pairs (in each color-dimension).
"""
START_T = time()
conflicts = []
for key, vals in hard.items():
    for val in vals:
        conflicts.append((key, val))
conflicts_np = np.array(conflicts)
conflicts_np = np.sort(conflicts, axis=1)
conflicts_np = np.unique(conflicts_np, axis=0)
""" Preprocessing -> map IDs to gapless range [0-N)
"""
unique = np.unique(conflicts)
old2new = {}
new2old = {}
counter = itertools.count()
N = unique.shape[0]
for i in unique:
    new_id = next(counter)
    old2new[i] = new_id
    new2old[new_id] = i
conflicts_np = np.vectorize(old2new.get)(conflicts_np)
""" Sparse conflict matrix """
conflict_matrix = sp.coo_matrix((np.ones(conflicts_np.shape[0] * 2),
                                 (np.tile(np.arange(conflicts_np.shape[0]), 2),
                                  conflicts_np.ravel(order='F'))),
                                shape=(conflicts_np.shape[0], N * 4))
I, J, V = sp.find(conflict_matrix)
""" Integer Programming """
model = CyClpSimplex()
# 4 colors -> 4 binary vars per element in N
x = model.addVariable('x', N*4, isInt=True)
# scoring: linear-objective
model.objective = -np.hstack((np.full(N, 10), np.full(N, 6), np.full(N, 3), np.full(N, 1)))
# sub-opt way of forcing binary-constraints (from ints)
# (this awkward usage is due to problem with cylp in the past)
model += sp.eye(N*4) * x >= np.zeros(N*4)
model += sp.eye(N*4) * x <= np.ones(N*4)
# conflicts in each color-dimensions
# sub-opt numpy/scipy usage
for ind, i in enumerate(range(4)):
    if ind == 0:
        model += conflict_matrix * x <= 1
    else:
        shifted_conflicts = sp.coo_matrix((V, (I, J + (ind * N))), shape=(conflict_matrix.shape[0], N * 4))
        model += shifted_conflicts * x <= 1
# force exactly one color per element
# sub-opt numpy/scipy usage
template = np.zeros(N*4)
template[0] = 1
template[N] = 1
template[2*N] = 1
template[3*N] = 1
all_color_dims = [sp.csc_matrix(np.roll(template, i).reshape(1,-1)) for i in range(N)]
model += sp.vstack(all_color_dims) * x == 1
cbcModel = model.getCbcModel() # Clp -> Cbc model / LP -> MIP
start_time = time()
status = cbcModel.solve()
end_time = time()
print(" CoinOR CBC used {:.{prec}f} secs".format(end_time - start_time, prec=3))
print(" Complete process used {:.{prec}f} secs".format(end_time - START_T, prec=3))
Output
Welcome to the CBC MILP Solver
Version: 2.9.9
Build Date: Jan 15 2018
command line - ICbcModel -solve -quit (default strategy 1)
Continuous objective value is -200 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 20 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 24 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 16 strengthened rows, 0 substitutions
Cgl0004I processed model has 153 rows, 100 columns (100 integer (100 of which binary)) and 380 elements
Cutoff increment increased from 1e-05 to 0.9999
Cbc0038I Initial state - 0 integers unsatisfied sum - 0
Cbc0038I Solution found of -194
Cbc0038I Before mini branch and bound, 100 integers at bound fixed and 0 continuous
Cbc0038I Mini branch and bound did not improve solution (0.01 seconds)
Cbc0038I After 0.01 seconds - Feasibility pump exiting with objective of -194 - took 0.00 seconds
Cbc0012I Integer solution of -194 found by feasibility pump after 0 iterations and 0 nodes (0.01 seconds)
Cbc0001I Search completed - best objective -194, took 0 iterations and 0 nodes (0.01 seconds)
Cbc0035I Maximum depth 0, 0 variables fixed on reduced cost
Cuts at root node changed objective from -194 to -194
Probing was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Gomory was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Knapsack was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Clique was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
MixedIntegerRounding2 was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
FlowCover was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
TwoMirCuts was tried 0 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Result - Optimal solution found
Objective value: -194.00000000
Enumerated nodes: 0
Total iterations: 0
Time (CPU seconds): 0.01
Time (Wallclock seconds): 0.01
Total time (CPU seconds): 0.01 (Wallclock seconds): 0.01
CoinOR CBC used 0.013 secs
Complete process used 0.042 secs
Results
Your "large" instance is solved within 0.042 secs (complete time with sub-opt code) and 0.013 secs are spent in the core-solver. Of course this is just one instance and interpreting this is not that scientific!
The result is the same as SaiBot's interesting customized-solution (and your smaller examples)!
(Some earlier code had a scoring-bug, which made me ask Saibot for double-checking his solution, which i can now reproduce!)
Transfer
MIP solvers should be available on most architectures and environments, probably even on mobiles (with a potentially non-trivial build process). Modelling and usage depend somewhat on the modelling system and surrounding software.
Here is my try at the problem. I was not able to come up with a better time complexity, but I optimized the brute force.
I processed the nodes one by one, only allowing colorings such that no two neighbouring nodes have the same color.
I added an upper-bound estimate for each intermediate (incomplete) coloring. For this I assumed that every uncolored node will be colored in the highest-scoring color still allowed by its already-colored neighbours, so in the upper-bound calculation two adjacent nodes that have not been colored yet could both be counted as "red". With this estimate I built a branch-and-bound algorithm that terminates the current search path when the upper bound does not exceed the current maximum.
The runtime for the small graph is less than 1 ms, for the medium graph 15 ms, and for the large graph 3.2 seconds. The results for the three graphs are 46, 77 and 194, respectively.
import time
import copy

def upperBoundScore(graph, dfsGraphOder, dfsIndex, coloring, scoring, currentScore):
    maxAdditionalScore = 0
    for i in range(dfsIndex, len(dfsGraphOder)):
        neighbourColors = {coloring[node] for node in graph[dfsGraphOder[i]]}
        possibleColors = {1, 2, 3, 4} - neighbourColors
        if len(possibleColors) < 1:  # if no color is available for some node, stop
            return -1
        # color 1 scores highest, and a set of small ints iterates in
        # ascending order in CPython, so this picks the best remaining color
        maxAdditionalScore += scoring[list(possibleColors)[0]]
    return currentScore + maxAdditionalScore

def colorRemainingGraph(graph, dfsGraphOder, dfsIndex, coloring, scoring, currentScore):
    global maxScore
    global bestColoring
    # whole graph colored
    if dfsIndex == len(dfsGraphOder):
        if currentScore > maxScore:
            maxScore = currentScore
            bestColoring = copy.deepcopy(coloring)
    # only proceed if the current coloring can still beat the best coloring
    elif upperBoundScore(graph, dfsGraphOder, dfsIndex, coloring, scoring, currentScore) > maxScore:
        neighbourColors = {coloring[node] for node in graph[dfsGraphOder[dfsIndex]]}
        possibleColors = list({1, 2, 3, 4} - neighbourColors)
        for c in possibleColors:
            coloring[dfsGraphOder[dfsIndex]] = c
            currentScore += scoring[c]
            colorRemainingGraph(graph, dfsGraphOder, dfsIndex + 1, coloring, scoring, currentScore)
            currentScore -= scoring[c]
            coloring[dfsGraphOder[dfsIndex]] = 0

#graph = {2: [4], 4: [2, 3, 14, 13], 3: [4], 14: [4], 13: [4]}
#graph = {2: [4, 5, 6], 4: [2, 3], 3: [4, 18], 5: [2, 6], 6: [5, 2, 13, 18], 13: [6, 20, 21, 22], 18: [6, 3, 20, 22], 20: [18, 13], 22: [18, 13], 21: [13]}
graph = {2: [5, 6, 9], 5: [2, 4], 4: [5, 23], 6: [2, 7, 10], 3: [8, 16], 8: [3, 7, 12], 7: [6, 8, 10, 11], 9: [2, 10], 10: [6, 9, 7, 13, 14, 15, 17, 18], 11: [7, 12, 13], 12: [8, 11, 15, 16, 19], 13: [10, 11, 15], 14: [10, 15], 15: [10, 13, 12, 14, 17, 19], 16: [3, 12, 25, 27], 17: [10, 15, 18], 18: [10, 17, 19, 20], 19: [15, 18, 12, 27], 20: [18, 22, 24, 26, 27, 25], 22: [20], 23: [4, 24, 26], 24: [23, 20], 25: [16, 20], 26: [23, 20], 27: [19, 20, 16]}
# 0 = uncolored, 1 = red, 2 = green, 3 = blue, 4 = yellow
scoring = {1: 10, 2: 6, 3: 3, 4: 1}
coloring = {node: 0 for node in graph.keys()}
nodeOrder = list(graph.keys())
maxScore = 0
bestColoring = {}
start = time.time()
colorRemainingGraph(graph, nodeOrder, 0, coloring, scoring, 0)
end = time.time()
print("Runtime: " + str(end - start))
print("Max Score: " + str(maxScore))
print(bestColoring)
For the large graph the resulting coloring is (1 = red, 2 = green, 3 = blue, 4 = yellow):
{2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 1, 8: 2, 9: 2, 10: 3, 11: 2, 12: 1, 13: 1, 14: 1, 15: 4, 16: 2, 17: 2, 18: 1, 19: 2, 20: 2, 22: 1, 23: 2, 24: 1, 25: 1, 26: 1, 27: 1}
To verify that the coloring output by the algorithm is correct, one can use the code below, which checks whether any two neighbouring nodes have the same color.
def checkSolution(graph, coloring):
    validColoring = 1
    for node in graph:
        for neighbour in graph[node]:
            if coloring[node] == coloring[neighbour]:
                print("wrong coloring found: " + str(node) + " and " + str(neighbour) + " have the same color")
                validColoring = 0
    if validColoring:
        print("Coloring is valid")

How many times a number appears as a leaf node?

Suppose you have an array of n elements, e.g.
A = {1,2,3,4,5}
A total of 5! binary search trees are possible (not necessarily distinct), one per insertion order. Now my question is: in how many of those trees does 1 appear as a leaf node, in how many does 2, and so on?
What I have tried:
I've seen that for A = {1,2,3}
2 appears 6/3 = 2 times
1 appears 2+1 = 3 times
3 appears 2+1 = 3 times
Can I generalise that and say that,
if A= {1,2,3,4}
2 = 24/4 = 6 times
3 = 24/4 = 6 times
1 = 6+1 = 7 times
4 = 6+1 = 7 times
We can generalize, but not in that way.
You can try permuting the array and producing all possible BSTs. A brute-force approach that returns the answer in a map/dictionary data structure shouldn't be that hard. First write a function that, given one of the permuted arrays, finds all leaves. It takes the first element as root, sends all smaller elements to the left and all greater ones to the right, and calls itself recursively on both parts, then combines the resulting values.
In the end, combine the values over all possible permutations.
A possible approach in python:
from itertools import permutations
def func(arr):
    # returns the set of values that end up as leaves of the BST
    # built by inserting arr's elements in order
    if not arr:
        return set()
    if len(arr) == 1:
        return {arr[0]}
    ans = set()
    left = func([v for v in arr[1:] if v < arr[0]])
    right = func([v for v in arr[1:] if v >= arr[0]])
    ans.update(left)
    ans.update(right)
    return ans

arr = [1, 2, 3, 4]
ans = {i: 0 for i in arr}
for a in permutations(arr):
    dic = func(a)
    print(a, ":", dic)
    for k in dic:
        ans[k] += 1
print(ans)
for [1,2,3] it outputs:
(1, 2, 3) : {3}
(1, 3, 2) : {2}
(2, 1, 3) : {1, 3}
(2, 3, 1) : {1, 3}
(3, 1, 2) : {2}
(3, 2, 1) : {1}
{1: 3, 2: 2, 3: 3}
for [1,2,3,4], only the last line, i.e. the answer, is:
{1: 12, 2: 8, 3: 8, 4: 12}
for [1,2,3,4,5], it is:
{1: 60, 2: 40, 3: 40, 4: 40, 5: 60}
Can you see the pattern? Well, one last example. For [1,2,3,4,5,6] it is:
{1: 360, 2: 240, 3: 240, 4: 240, 5: 240, 6: 360}
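In case the pattern is hard to spot: value i ends up as a leaf exactly when its value-neighbours (i-1 and i+1, where they exist) are inserted before it, which happens in n!/2 of the permutations for the two endpoints and in n!/3 for every interior value. A quick check of that closed form against the brute force above (my snippet, worth double-checking):
from math import factorial

def leaf_counts(n):
    # endpoints need only their single neighbour inserted first (1/2 of orders);
    # interior values need both neighbours inserted first (1/3 of orders)
    return {i: factorial(n) // (2 if i in (1, n) else 3) for i in range(1, n + 1)}

print(leaf_counts(5))  # {1: 60, 2: 40, 3: 40, 4: 40, 5: 60}
print(leaf_counts(6))  # {1: 360, 2: 240, 3: 240, 4: 240, 5: 240, 6: 360}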

Pandas distance matrix performance with vector data

Even though I found a few threads dealing with distance-matrix efficiency, they all use either an int or a float matrix. In my case I have to deal with vectors (an orderedDict of frequencies), and I only end up with a very slow method that is not viable with a large DataFrame (300,000 x 300,000).
How can I make the process more optimized?
I would be very thankful for any help, this problem has been killing me :)
Considering DataFrame df such as:
>>> df
vectors
id
1 {dict1}
2 {dict2}
3 {dict3}
4 {dict4}
where {dict#} is of the form:
orderedDict{event1: 1,
event2: 5,
event3: 0,
...}
A function to return the distance between two vectors:
def vectorDistance(a, b, df_vector):
    # Calculate distance between a & b
    # based on the vector from df_vector.
    return distance
[in]: vectorDistance({dict1}, {dict2})
[out]: distance
A desired Output:
1 2 3 4
id
1 0 1<->2 1<->3 1<->4
2 1<->2 0 ... ...
3 1<->3 ... 0 ...
4 1<->4 ... ... 0
(where 1<->2 is a float distance between vector 1 & 2)
Method used:
import pandas as pd
matrix = pd.concat([df, df.T], axis=1)
for index in matrix.index:
    for col in matrix.columns:
        matrix.ix[col, index] = vectorDistance(col, index, df)
>>> matrix
5072142538 5072134420 4716823618 ...
udid
5072142538 0.00000 0.01501 0.06002 ...
5072134420 0.01501 0.00000 0.09037 ...
4716823618 0.06002 0.09037 0.00000 ...
... ... ... ...
EDIT:
Minimal example
Note: the events can differ from one {dict} to another, but that's OK when they are passed to the function. My issue is more how to pass the right a & b to fill each cell in a fast way.
I am working with cosine distance as it's rather good with vectors such as mine.
from collections import Counter
from math import sqrt
import numpy as np
import pandas as pd
raw_data = {'counters_': {4716823618: Counter({51811: 1, 51820: 1, 51833: 56, 51835: 8, 51843: 48, 51848: 2, 51852: 8, 51853: 5, 51854: 4, 51856: 24, 51903: 11, 51904: 12, 51905: 3, 51906: 19, 51908: 230, 51922: 24, 51927: 19, 51931: 2, 106282: 9, 112830: 1, 119453: 1, 165062: 80, 168904: 3, 180354: 19, 180437: 33, 185824: 117, 186171: 14, 187101: 1, 190827: 7, 201629: 1, 209318: 37}), 5072134420: Counter({51811: 1, 51812: 1, 51820: 1, 51833: 56, 51835: 9, 51843: 49, 51848: 2, 51852: 11, 51853: 4, 51854: 4, 51856: 28, 51885: 1, 51903: 17, 51904: 17, 51905: 9, 51906: 14, 51908: 225, 51927: 29, 51931: 2, 106282: 19, 112830: 2, 168904: 9, 180354: 14, 185824: 219, 186171: 7, 187101: 1, 190827: 6, 201629: 2, 209318: 41}), 5072142538: Counter({51811: 4, 51812: 4, 51820: 4, 51833: 56, 51835: 8, 51843: 48, 51848: 2, 51852: 6, 51853: 3, 51854: 3, 51856: 18, 51885: 1, 51903: 17, 51904: 16, 51905: 3, 51906: 24, 51908: 258, 51927: 20, 51931: 8, 106282: 16, 112830: 2, 168904: 3, 180354: 24, 185824: 180, 186171: 10, 187101: 1, 190827: 7, 201629: 2, 209318: 52})}}
def vectorDistance(index, col):
    a = dict(df[df.index == index]["counters_"].values[0])
    b = dict(df[df.index == col]["counters_"].values[0])
    return abs(np.round(1 - similarity(a, b), 5))

def scalar(collection):
    total = 0
    for coin, count in collection.items():
        total += count * count
    return sqrt(total)

def similarity(A, B):
    total = 0
    for kind in A:
        if kind in B:
            total += A[kind] * B[kind]
    return float(total) / (scalar(A) * scalar(B))

df = pd.DataFrame(raw_data)
matrix = pd.concat([df, df.T], axis=1)
matrix.drop("counters_", 0, inplace=True)
matrix.drop("counters_", 1, inplace=True)
for index in matrix.index:
    for col in matrix.columns:
        matrix.ix[col, index] = vectorDistance(col, index)
matrix
This is certainly more efficient and easier to read than using for loops.
df = pd.DataFrame([v for v in raw_data['counters_'].values()],
                  index=raw_data['counters_'].keys()).T
>>> df.head()
4716823618 5072134420 5072142538
51811 1 1 4
51812 NaN 1 4
51820 1 1 4
51833 56 56 56
51835 8 9 8
# raw_data no longer needed. Delete to reduce memory footprint.
del raw_data
# Create scalars.
scalars = ((df ** 2).sum()) ** .5
>>> scalars
4716823618 289.679133
5072134420 330.548030
5072142538 331.957829
dtype: float64
def v_dist(col_1, col_2):
    return 1 - ((df.iloc[:, col_1] * df.iloc[:, col_2]).sum() /
                (scalars.iloc[col_1] * scalars.iloc[col_2]))
>>> v_dist(0, 1)
0.09036665882900885
>>> v_dist(0, 2)
0.060016436804916085
>>> v_dist(1, 2)
0.015009898476505357
m = pd.DataFrame(np.nan * len(df.columns), index=df.columns, columns=df.columns)
>>> m
4716823618 5072134420 5072142538
4716823618 NaN NaN NaN
5072134420 NaN NaN NaN
5072142538 NaN NaN NaN
for row in range(m.shape[0]):
    for col in range(row, m.shape[1]):  # Note: m.shape[0] equals m.shape[1]
        if row == col:
            # No need to calculate value for diagonal.
            m.iat[row, col] = 0
        else:
            # Do two calculations in one due to symmetry.
            m.iat[row, col] = m.iat[col, row] = v_dist(row, col)
>>> m
4716823618 5072134420 5072142538
4716823618 0.000000 0.090367 0.060016
5072134420 0.090367 0.000000 0.015010
5072142538 0.060016 0.015010 0.000000
Wrapping all of this into a function:
def calc_matrix(raw_data):
    df = pd.DataFrame([v for v in raw_data['counters_'].values()],
                      index=raw_data['counters_'].keys()).T
    scalars = ((df ** 2).sum()) ** .5
    m = pd.DataFrame(np.nan * len(df.columns), index=df.columns, columns=df.columns)
    for row in range(m.shape[0]):
        for col in range(row, m.shape[1]):
            if row == col:
                m.iat[row, col] = 0
            else:
                m.iat[row, col] = m.iat[col, row] = (1 -
                    (df.iloc[:, row] * df.iloc[:, col]).sum() /
                    (scalars.iloc[row] * scalars.iloc[col]))
    return m
You don't want to store dicts inside your dataframe. Read your data into a dataframe using the from_dict method:
df = pd.DataFrame.from_dict(raw_data['counters_'],orient='index')
Then you can apply the numpy/scipy vectorised methods for computing cosine similarity, as in What's the fastest way in Python to calculate cosine similarity given sparse matrix data?
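For instance, a minimal dense-matrix sketch of that idea (my code, not taken from the linked answer; it treats missing events as zero counts):
import numpy as np
import pandas as pd

df = pd.DataFrame.from_dict(raw_data['counters_'], orient='index').fillna(0)

# Normalise each row vector once; a single matrix product then yields
# every pairwise cosine similarity, and distance = 1 - similarity.
X = df.values.astype(float)
X /= np.linalg.norm(X, axis=1, keepdims=True)
dist = pd.DataFrame(1 - X @ X.T, index=df.index, columns=df.index)
print(dist.round(5))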

Counting unvisited nodes at distance n for every node in graph

For each point in a large graph I am trying to create a list that contains the number of unvisited nodes at distance n from the starting node. An example output is:
[1,3,6]
which means that at distance 0 there is the starting node itself, at distance 1 there are 3 new (unexplored) nodes, etc.
If you have only one starting point, this is fairly easy: you just augment a shell counter on top of a breadth-first search (see the sketch below). The problem starts when I have to do this for every node in my graph. Because my graph is large (> 100000 nodes), it becomes rather slow to run the above routine for every point.
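For illustration, here is what that single-source shell counter can look like (my sketch, assuming the graph is a dict mapping each node to a list of neighbours):
def shell_counts(graph, start):
    # breadth-first search that records how many new (previously unvisited)
    # nodes appear at each distance from the start node
    seen = {start}
    frontier = [start]
    counts = [1]  # distance 0: the start node itself
    while frontier:
        nxt = []
        for node in frontier:
            for nb in graph[node]:
                if nb not in seen:
                    seen.add(nb)
                    nxt.append(nb)
        if nxt:
            counts.append(len(nxt))
        frontier = nxt
    return counts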
My first attempt at optimizing this was to check whether the list at node a could be constructed from the lists of all of a's neighbours, but so far I've had no luck, partly due to cycles in the graph. I am hoping that some of you may have some nice ideas, maybe involving some additional information I can cache.
My question: is there a way to optimize such a search if you know that you will have to do it for every node?
It seems unlikely that there is a solution in less than O(n*|V|^2), so here is an approach in Python that seems not too terrible.
# some basic topologies
def lineE(N):
    return set((i, i + 1) for i in range(N - 1))

def ringE(N):
    return lineE(N).union([(0, N - 1)])

def fullE(N):
    return set([(i, j) for i in range(N) for j in range(i)])

# propagate visitors from x to y
def propagate(V, curr, x, y, d):
    nexty = set()
    for cx in curr[x]:
        if cx not in V[y]["seen"]:
            V[y]["seen"].add(cx)
            V[y]["foaf"][d] = V[y]["foaf"].get(d, 0) + 1
            nexty.add(cx)
    return nexty

# run for D iterations
def mingle(N, E, D):
    V = dict((i, {"seen": set([i]), "foaf": {0: 1}}) for i in range(N))
    curr = dict((i, set([i])) for i in range(N))
    for d in range(1, min(D + 1, N)):
        next = dict((i, set()) for i in range(N))
        for (h, t) in E:
            next[t] = next[t].union(propagate(V, curr, h, t, d))
            next[h] = next[h].union(propagate(V, curr, t, h, d))
        curr = next
    return V
Trying it out with 10 nodes and distance 3,
N = 10
D = 3
for (topology, E) in [("line", lineE(N)), ("ring", ringE(N)), ("full", fullE(N))]:
    V = mingle(N, E, D)
    print("\n" + topology)
    for v in V:
        print(v, V[v]["foaf"])
we get
line
0 {0: 1, 1: 1, 2: 1, 3: 1}
1 {0: 1, 1: 2, 2: 1, 3: 1}
2 {0: 1, 1: 2, 2: 2, 3: 1}
3 {0: 1, 1: 2, 2: 2, 3: 2}
4 {0: 1, 1: 2, 2: 2, 3: 2}
5 {0: 1, 1: 2, 2: 2, 3: 2}
6 {0: 1, 1: 2, 2: 2, 3: 2}
7 {0: 1, 1: 2, 2: 2, 3: 1}
8 {0: 1, 1: 2, 2: 1, 3: 1}
9 {0: 1, 1: 1, 2: 1, 3: 1}
ring
0 {0: 1, 1: 2, 2: 2, 3: 2}
1 {0: 1, 1: 2, 2: 2, 3: 2}
2 {0: 1, 1: 2, 2: 2, 3: 2}
3 {0: 1, 1: 2, 2: 2, 3: 2}
4 {0: 1, 1: 2, 2: 2, 3: 2}
5 {0: 1, 1: 2, 2: 2, 3: 2}
6 {0: 1, 1: 2, 2: 2, 3: 2}
7 {0: 1, 1: 2, 2: 2, 3: 2}
8 {0: 1, 1: 2, 2: 2, 3: 2}
9 {0: 1, 1: 2, 2: 2, 3: 2}
full
0 {0: 1, 1: 9}
1 {0: 1, 1: 9}
2 {0: 1, 1: 9}
3 {0: 1, 1: 9}
4 {0: 1, 1: 9}
5 {0: 1, 1: 9}
6 {0: 1, 1: 9}
7 {0: 1, 1: 9}
8 {0: 1, 1: 9}
9 {0: 1, 1: 9}
which seems correct. Also, running the simple topologies for distance 100 with 100000 nodes takes about a minute on my laptop. Of course if you have a dense graph (like fullE) this will blow up.
N = 100000
D = 100
for (topology, E) in [("line", lineE(N)), ("ring", ringE(N))]:
    V = mingle(N, E, D)
