-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathgraph_analyzed_txts.py
369 lines (282 loc) · 15.2 KB
/
graph_analyzed_txts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
## Author: Seyyid Hikmet Celik
## This script traverse on the analyzed texts of queries and makes stacked bar chart from them.
## Below commands are executing on the texts and then it extracts info from them
# cat /home/guest/bsc/tpcds_queries/analyzing_txts/q23.txt | plan-exporter --target=depesz --auto-confirm
# echo $(wget https://explain.depesz.com/s/iPQ9#stats -q -O -)
######### REQUIRED INSTALLATIONS #########
## Before running you need to install plan-exporter tool like this:
# wget https://github.com/agneum/plan-exporter/releases/download/v0.0.5/plan-exporter-0.0.5-linux-amd64.tar.gz
# tar -zxvf plan-exporter-0.0.5-linux-amd64.tar.gz
# sudo mv plan-exporter-*/plan-exporter /usr/local/bin/
# rm -rf ./plan-exporter-*
## If cannot find the python modules:
# pip3 install matplotlib
# pip3 install BeautifulSoup4
# pip3 install numpy
# pip3 install pprint
# pip3 install lxml
##### END OF REQUIRED INSTALLATIONS #####
######### REQUIRED CHANGES #########
## Analyzed txts path
## Change this path
atxts_path = '/home/guest/txts/server/1gb4shared/actxts/' #'/home/guest/txts/dell/1gb/atxtsindexed/' #'/home/guest/denegenyeni/tmpq0/atxts/' #'/home/guest/denegenyeni/tmpq0/atxts/' #'/home/guest/bsc/tpcds_10gb/atxts10gb/' #'/home/guest/bsc/tpcds_queries/analyzing_txts/'
## Also change these, if you need
txt_pfx = 'q' ## e.g. q23.txt, prefix of text before the query number
txt_sfx = 'a.txt' ## suffix after the query number
## Saved tables path
tabletxts_path = '/home/guest/tables1gb2/'
tabletxt_pfx = 'qatable'
tabletxt_sfx = '.txt'
## If there is less query, then change this too
maxnumofqueries = 99
numofqueries = maxnumofqueries
plot_title_name = 'The Most Consumer Functions in TPC-DS Queries - 1GB'
pdf_name = 'tpcds1gb'
planexportertool_path = 'plan-exporter-tool/'
##### END OF REQUIRED CHANGES #####
import os
## Create if the directory does not exist
if not os.path.exists(tabletxts_path):
os.mkdir(tabletxts_path)
import subprocess
import re
from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import argparse
import pprint
query_list = []
## You can use this command line argument e.g.:
# python3 allinonce.py --hlfunc="Sort"
## to highlight desired function in the bar chart
parser = argparse.ArgumentParser(description='Create Configuration')
parser.add_argument('-dz', '--depesz', action='store_true', help='Firstly upload to depesz analyzed txts', default=False)
parser.add_argument('-hf', '--hlfunc', type=str, help='Specify function to highlight it', default='')
parser.add_argument('-ql', '--querylist', type=str, help='Specify query list for special graphs', default='')
parser.add_argument('-p', '--part', type=str, help='Specify a part (1-10) to make a smaller part of queries graph', default='')
parser.add_argument('-bt', '--bottomed', action='store_true', help='Make bottomed highlighted function graphs', default=False)
args = parser.parse_args()
if ''.join(args.querylist.split()) != '':
query_list = [each_number.strip() for each_number in args.querylist.split(',')]
query_list[:] = [int(x) for x in query_list if x.strip()]
which_part = 1
if args.part != '':
if int(args.part) <= 10 and int(args.part) >= 1:
which_part = int(args.part)
if args.querylist == '':
query_list = list(range(10*(which_part-1)+1, 10*(which_part-1)+ (11 if which_part != 10 else 10)))
if len(query_list) > 0 and len(query_list) < maxnumofqueries+1:
numofqueries = len(query_list)
## If you want to add a function adding to functions list is sufficient, it will generate extra colors automatically
## If you want to remove a function removing from function list is sufficient, it will remove extra colors
## However, if you want to match colors and functions one-to-one, you need to add colors to colors list as you added functions to functions list
## Always, just leave 'Other' as last element
functions = ['Parallel Hash Join', 'Sort', 'Index Scan', 'Index Only Scan', 'Parallel Seq Scan', 'Gather', 'Gather Merge', 'Bitmap Heap Scan', 'CTE Scan', 'Partial HashAggregate', 'Seq Scan', 'HashAggregate', 'Finalize GroupAggregate', 'MixedAggregate', 'Hash Join', 'Other'] #['Other'] #['Parallel Hash Join', 'Sort', 'Index Scan', 'Index Only Scan', 'Parallel Seq Scan', 'Gather', 'Gather Merge', 'Bitmap Heap Scan', 'CTE Scan', 'Partial HashAggregate', 'Seq Scan', 'HashAggregate', 'Finalize GroupAggregate', 'MixedAggregate', 'Hash Join', 'Other']
colors=['blue', 'green', 'red', '#F5DEB3', 'cyan', 'magenta', 'yellow', '#800000', '#FF8C00', '#00FF7F', '#4682B4', '#800080', '#8B4513', '#696969', '#808000', 'black']
## These colors for highlighting desired function
h1color = 'blue' ## desired function color
h2color = '#808080' ## other functions color
## 1 to 99 (numofqueries) numbers for x axis of bar plot
xlist = []
## Number of functions list each with numofqueries items filled with 0 (default 16x99 matrix)
## This list is using to save percentages of each desired function for each query
ylist = [[0 for x in range(numofqueries)] for y in range(len(functions))]
## A list to save percentages of the most consumer functions for each query
maxylist = []
## A list to save names of the most consumer functions for each query
maxnamelist = []
## Even in one exception do not order
dont_order = False
for count in range(1, numofqueries+1):
try:
query_number = query_list[count-1] if numofqueries != maxnumofqueries else count
max_prcnt = 0
name_of_max = ''
if args.depesz:
table_txt = open(tabletxts_path + tabletxt_pfx + query_number.__str__() + tabletxt_sfx, 'w')
## Uploads each analyzed text to explain.depesz.com
upload_to_depesz = subprocess.getoutput("cat " + atxts_path + txt_pfx + query_number.__str__() + txt_sfx + " | " + planexportertool_path + "plan-exporter --target=depesz --auto-confirm")
## Saves given url after uploading
url = re.search('URL: (.*)', upload_to_depesz).group(1)
url = url + '#stats'
## Getting the xml source of the page from given url
get_depesz_source = subprocess.getoutput("echo $(wget " + url + " -q -O -)")
## Regexing the xml to find desired table in the page
xml = re.sub(r"\b(Per node type stats.*?)\b\btable", r"\1table2", get_depesz_source)
xml = re.sub(r"\b(Per node type stats.*?)\b\btable\>", r"\1table2>", xml)
## Parsing xml and getting the rows of the table
parse_xml = bs(xml, 'lxml')
find_table = parse_xml.find('table2')
table_rows = find_table.find_all('tr')
## Traversing on the table's rows and columns and save them to the lists
for i in table_rows[1:]:
table_data = i.find_all('td')
data = [j.text for j in table_data]
table_txt.write(data[0] + '|' + data[1] + '|' + data[2] + '|' + data[3] + '\n')
table_txt.flush()
table_txt.close()
## Prints depesz url, the most consumer functions and the percentages for each query
## Keep in mind that max percentages are not normalized while printing, they are normalizing in the bar chart
print(txt_pfx + query_number.__str__() + txt_sfx + ' --> ' + url)
table_txt = open(tabletxts_path + tabletxt_pfx + query_number.__str__() + tabletxt_sfx, 'r')
for each_row in table_txt:
data = each_row.split('|')
each_prcnt = float(data[3].replace('%', '').strip())
if each_prcnt > max_prcnt:
max_prcnt = each_prcnt
name_of_max = data[0]
table_txt.close()
maxnamelist.append(name_of_max)
maxylist.append(max_prcnt)
## If there is an exception above, probably the analyzed text is empty or not found because of the syntax errors in the queries
except Exception as e:
maxnamelist.append('empty')
maxylist.append(0)
dont_order = True
maxnamedict = {i:maxnamelist.count(i) for i in maxnamelist}
sorted_maxnamedict = sorted(maxnamedict.items(), key=lambda x:x[1], reverse=True)
## TODO I can do it via also empty functions list, we need to think about that
## If most consumer function not in list then append functions list
for each_pair in sorted_maxnamedict[::-1]:
if each_pair[0] not in functions:
if len(functions) <= 1: ## either there is other in the list or empty
functions.insert(0, each_pair[0])
else:
functions.insert(functions.index(functions[-2]), each_pair[0])
ylist.append([0]*numofqueries)
for count in range(1, numofqueries+1):
try:
query_number = query_list[count-1] if numofqueries != maxnumofqueries else count
## Saves query numbers for x axis of the plot
xlist.append(query_number.__str__())
max_prcnt = 0
name_of_max = ''
table_txt = open(tabletxts_path + tabletxt_pfx + query_number.__str__() + tabletxt_sfx, 'r')
for each_row in table_txt:
data = each_row.split('|')
each_prcnt = float(data[3].replace('%', '').strip())
if data[0] in functions:
ylist[functions.index(data[0])][count-1] = float(data[3].replace('%', '').strip())
else: ## For other functions
ylist[functions.index(functions[-1])][count-1] += float(data[3].replace('%', '').strip())
if each_prcnt > max_prcnt:
max_prcnt = each_prcnt
name_of_max = data[0]
table_txt.close()
## Prints depesz url, the most consumer functions and the percentages for each query
## Keep in mind that max percentages are not normalized while printing, they are normalizing in the bar chart
print(tabletxt_pfx + query_number.__str__() + tabletxt_sfx + ' --> ' + name_of_max + ' ' + max_prcnt.__str__())
## If there is an exception above, probably the analyzed text is empty or not found because of the syntax errors in the queries
except Exception as e:
print(e)
print(tabletxt_pfx + query_number.__str__() + tabletxt_sfx + ' --> empty')
print()
print()
## This prints number of the most consumer functions out of 99 (numofqueries) query in descending order
pprint.pprint(sorted_maxnamedict)
print(functions)
## This is for if you add extra functions to functions list and don't specify the color for them, it generates random colors for them.
import random
random.seed(1487) ## generate same colors for same number of functions by seeding random
get_rand_colors = lambda n: list(map(lambda i: "#" + "%06x" % random.randint(0, 0xFFFFFF),range(n)))
## Either functions removed or added to functions list it arranges the colors list accordingly, and for example, after adding extra functions, it adds extra colors randomly
if len(functions) > len(colors):
extra_colors = get_rand_colors(len(functions) - len(colors))
for each_color in extra_colors:
colors.insert(colors.index(colors[-2]), each_color)
elif len(functions) < len(colors):
for i in range(0, len(colors) - len(functions)):
colors.pop(colors.index(colors[-2]))
## Order lists by max consumer functions
if not dont_order:
temp_functions = functions
for each_pair in sorted_maxnamedict[::-1]:
ylist.insert(0, ylist.pop(temp_functions.index(each_pair[0])))
colors.insert(0, colors.pop(temp_functions.index(each_pair[0])))
functions.remove(each_pair[0]) ## remove max names from functions
functions.insert(0, each_pair[0]) ## insert them as reversed order in the beginning of list
print()
## This is normalizing data to 100 percent for bar plot, otherwise overlapping occurs and percentages are exceeding 100 percent
## Remove this if you dont want to normalize
for i in range(0, numofqueries):
eachq_prcnt_sum = 0
for j in range(0, len(functions)):
eachq_prcnt_sum += ylist[j][i]
if eachq_prcnt_sum > 100:
rate = eachq_prcnt_sum / 100.0
for j in range(0, len(functions)):
ylist[j][i] = ylist[j][i] / rate
## If e.g. --hlfunc="Sort" option is given from the command line, then this makes the colors two seperate color as "Sort" and the others
highlight = False
if args.hlfunc in functions:
highlight = True
skip_index = functions.index(args.hlfunc)
for i in range(0, len(functions)):
if i != skip_index:
colors[i] = h2color #'#808080' #'black'
else:
colors[i] = h1color #'blue'
## make bottomed
if highlight and args.bottomed:
## bir tane degistirecegim icin burada, index karismayacak yoksa temp tutmak lazim
ylist.insert(0, ylist.pop(functions.index(args.hlfunc)))
colors.insert(0, colors.pop(functions.index(args.hlfunc)))
functions.remove(args.hlfunc) ## remove max names from functions
functions.insert(0, args.hlfunc) ## insert them as reversed order in the beginning
plt.rcParams["font.size"] = "18"
## Plotting stacked bar chart according to list of functions
bars1 = plt.bar(xlist, ylist[0], color=colors[0])
cumulative_list = np.array(ylist[0])
count = 0
for eachylist in ylist[1:]:
count += 1
plt.bar(xlist, eachylist, color=colors[count], bottom=cumulative_list)
cumulative_list = cumulative_list + np.array(eachylist)
## At the top of the chart, this is writing max percentage for each bar, for the most consumer function in each query
count = 0
for each_bar in bars1:
plt.text(each_bar.get_x() + each_bar.get_width() / 2.0, 110, '% ' + maxylist[count].__str__(), color='black', ha='center', va='center', rotation='vertical', fontsize=18)
count = count + 1
## Plot title and preferences
if highlight:
plt.title(plot_title_name + ' - ' + args.hlfunc.strip(), fontsize=30)
else:
plt.title(plot_title_name, fontsize=30)
plt.xlabel('Queries', fontsize=30)
plt.ylabel('Percentage', fontsize=30)
plt.xticks(rotation=75)
plt.grid()
## Add legends automatically to the plot
patch_list = []
i = 0
if highlight:
patch_list.append(mpatches.Patch(label=args.hlfunc, color=h1color))
patch_list.append(mpatches.Patch(label='Other', color=h2color))
else:
for each_func in functions:
patch_list.append(mpatches.Patch(label=each_func.replace(' ', '\n'), color=colors[i]))
i += 1
plt.legend(handles=patch_list, fontsize=22, loc=(0.96, 0))
#plt.rc('font', size=20) # controls default text sizes
#plt.rc('axes', titlesize=20) # fontsize of the axes title
#plt.rc('axes', labelsize=20) # fontsize of the x and y labels
#plt.rc('xtick', labelsize=20) # fontsize of the tick labels
#plt.rc('ytick', labelsize=20) # fontsize of the tick labels
#plt.rc('legend', fontsize=20) # legend fontsize
#plt.rc('figure', titlesize=20) # fontsize of the figure title
#plt.rcParams.update({'font.size': 22})
## Maximize the plot window
figure = plt.gcf()
figure.set_size_inches(32,18)
## Tight the graph
plt.tight_layout(pad=0)
## Save the plot to pdf
if highlight:
plt.savefig(pdf_name + '_' + ''.join(args.hlfunc.split()) + '.pdf', dpi=300)
plt.savefig(pdf_name + '_' + ''.join(args.hlfunc.split()) + '.png', dpi=300)
else:
plt.savefig(pdf_name + '.pdf', dpi=300)
plt.savefig(pdf_name + '.png', dpi=300)
## Show the plot
#plt.show()