dbinit.py revision 1772:a3a83e812a5e
1# Copyright (c) 2003-2004 The Regents of The University of Michigan 2# All rights reserved. 3# 4# Redistribution and use in source and binary forms, with or without 5# modification, are permitted provided that the following conditions are 6# met: redistributions of source code must retain the above copyright 7# notice, this list of conditions and the following disclaimer; 8# redistributions in binary form must reproduce the above copyright 9# notice, this list of conditions and the following disclaimer in the 10# documentation and/or other materials provided with the distribution; 11# neither the name of the copyright holders nor the names of its 12# contributors may be used to endorse or promote products derived from 13# this software without specific prior written permission. 14# 15# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 27import MySQLdb 28 29class MyDB(object): 30 def __init__(self, options): 31 self.name = options.db 32 self.host = options.host 33 self.user = options.user 34 self.passwd = options.passwd 35 self.mydb = None 36 self.cursor = None 37 38 def admin(self): 39 self.close() 40 self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user, 41 passwd=self.passwd) 42 self.cursor = self.mydb.cursor() 43 44 def connect(self): 45 self.close() 46 self.mydb = MySQLdb.connect(db=self.name, host=self.host, 47 user=self.user, passwd=self.passwd) 48 self.cursor = self.mydb.cursor() 49 50 def close(self): 51 if self.mydb is not None: 52 self.mydb.close() 53 self.cursor = None 54 55 def query(self, sql): 56 self.cursor.execute(sql) 57 58 def drop(self): 59 self.query('DROP DATABASE IF EXISTS %s' % self.name) 60 61 def create(self): 62 self.query('CREATE DATABASE %s' % self.name) 63 64 def populate(self): 65 # 66 # Each run (or simulation) gets its own entry in the runs table to 67 # group stats by where they were generated 68 # 69 # COLUMNS: 70 # 'id' is a unique identifier for each run to be used in other 71 # tables. 72 # 'name' is the user designated name for the data generated. It is 73 # configured in the simulator. 74 # 'user' identifies the user that generated the data for the given 75 # run. 76 # 'project' another name to identify runs for a specific goal 77 # 'date' is a timestamp for when the data was generated. It can be 78 # used to easily expire data that was generated in the past. 79 # 'expire' is a timestamp for when the data should be removed from 80 # the database so we don't have years worth of junk. 81 # 82 # INDEXES: 83 # 'run' is indexed so you can find out details of a run if the run 84 # was retreived from the data table. 85 # 'name' is indexed so that two all run names are forced to be unique 86 # 87 self.query(''' 88 CREATE TABLE runs( 89 rn_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 90 rn_name VARCHAR(200) NOT NULL, 91 rn_sample VARCHAR(32) NOT NULL, 92 rn_user VARCHAR(32) NOT NULL, 93 rn_project VARCHAR(100) NOT NULL, 94 rn_date TIMESTAMP NOT NULL, 95 rn_expire TIMESTAMP NOT NULL, 96 PRIMARY KEY (rn_id), 97 UNIQUE (rn_name,rn_sample) 98 ) TYPE=InnoDB''') 99 100 # 101 # We keep the bin names separate so that the data table doesn't get 102 # huge since bin names are frequently repeated. 103 # 104 # COLUMNS: 105 # 'id' is the unique bin identifer. 106 # 'name' is the string name for the bin. 107 # 108 # INDEXES: 109 # 'bin' is indexed to get the name of a bin when data is retrieved 110 # via the data table. 111 # 'name' is indexed to get the bin id for a named bin when you want 112 # to search the data table based on a specific bin. 113 # 114 self.query(''' 115 CREATE TABLE bins( 116 bn_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 117 bn_name VARCHAR(255) NOT NULL, 118 PRIMARY KEY(bn_id), 119 UNIQUE (bn_name) 120 ) TYPE=InnoDB''') 121 122 # 123 # The stat table gives us all of the data for a particular stat. 124 # 125 # COLUMNS: 126 # 'stat' is a unique identifier for each stat to be used in other 127 # tables for references. 128 # 'name' is simply the simulator derived name for a given 129 # statistic. 130 # 'descr' is the description of the statistic and what it tells 131 # you. 132 # 'type' defines what the stat tells you. Types are: 133 # SCALAR: A simple scalar statistic that holds one value 134 # VECTOR: An array of statistic values. Such a something that 135 # is generated per-thread. Vectors exist to give averages, 136 # pdfs, cdfs, means, standard deviations, etc across the 137 # stat values. 138 # DIST: Is a distribution of data. When the statistic value is 139 # sampled, its value is counted in a particular bucket. 140 # Useful for keeping track of utilization of a resource. 141 # (e.g. fraction of time it is 25% used vs. 50% vs. 100%) 142 # VECTORDIST: Can be used when the distribution needs to be 143 # factored out into a per-thread distribution of data for 144 # example. It can still be summed across threads to find 145 # the total distribution. 146 # VECTOR2D: Can be used when you have a stat that is not only 147 # per-thread, but it is per-something else. Like 148 # per-message type. 149 # FORMULA: This statistic is a formula, and its data must be 150 # looked up in the formula table, for indicating how to 151 # present its values. 152 # 'subdata' is potentially used by any of the vector types to 153 # give a specific name to all of the data elements within a 154 # stat. 155 # 'print' indicates whether this stat should be printed ever. 156 # (Unnamed stats don't usually get printed) 157 # 'prereq' only print the stat if the prereq is not zero. 158 # 'prec' number of decimal places to print 159 # 'nozero' don't print zero values 160 # 'nonan' don't print NaN values 161 # 'total' for vector type stats, print the total. 162 # 'pdf' for vector type stats, print the pdf. 163 # 'cdf' for vector type stats, print the cdf. 164 # 165 # The Following are for dist type stats: 166 # 'min' is the minimum bucket value. Anything less is an underflow. 167 # 'max' is the maximum bucket value. Anything more is an overflow. 168 # 'bktsize' is the approximate number of entries in each bucket. 169 # 'size' is the number of buckets. equal to (min/max)/bktsize. 170 # 171 # INDEXES: 172 # 'stat' is indexed so that you can find out details about a stat 173 # if the stat id was retrieved from the data table. 174 # 'name' is indexed so that you can simply look up data about a 175 # named stat. 176 # 177 self.query(''' 178 CREATE TABLE stats( 179 st_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 180 st_name VARCHAR(255) NOT NULL, 181 st_descr TEXT NOT NULL, 182 st_type ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST", 183 "VECTOR2D", "FORMULA") NOT NULL, 184 st_print BOOL NOT NULL, 185 st_prereq SMALLINT UNSIGNED NOT NULL, 186 st_prec TINYINT NOT NULL, 187 st_nozero BOOL NOT NULL, 188 st_nonan BOOL NOT NULL, 189 st_total BOOL NOT NULL, 190 st_pdf BOOL NOT NULL, 191 st_cdf BOOL NOT NULL, 192 st_min DOUBLE NOT NULL, 193 st_max DOUBLE NOT NULL, 194 st_bktsize DOUBLE NOT NULL, 195 st_size SMALLINT UNSIGNED NOT NULL, 196 PRIMARY KEY (st_id), 197 UNIQUE (st_name) 198 ) TYPE=InnoDB''') 199 200 # 201 # This is the main table of data for stats. 202 # 203 # COLUMNS: 204 # 'stat' refers to the stat field given in the stat table. 205 # 206 # 'x' referrs to the first dimension of a multi-dimensional stat. For 207 # a vector, x will start at 0 and increase for each vector 208 # element. 209 # For a distribution: 210 # -1: sum (for calculating standard deviation) 211 # -2: sum of squares (for calculating standard deviation) 212 # -3: total number of samples taken (for calculating 213 # standard deviation) 214 # -4: minimum value 215 # -5: maximum value 216 # -6: underflow 217 # -7: overflow 218 # 'y' is used by a VECTORDIST and the VECTOR2D to describe the second 219 # dimension. 220 # 'run' is the run that the data was generated from. Details up in 221 # the run table 222 # 'tick' is a timestamp generated by the simulator. 223 # 'bin' is the name of the bin that the data was generated in, if 224 # any. 225 # 'data' is the actual stat value. 226 # 227 # INDEXES: 228 # 'stat' is indexed so that a user can find all of the data for a 229 # particular stat. It is not unique, because that specific stat 230 # can be found in many runs, bins, and samples, in addition to 231 # having entries for the mulidimensional cases. 232 # 'run' is indexed to allow a user to remove all of the data for a 233 # particular execution run. It can also be used to allow the 234 # user to print out all of the data for a given run. 235 # 236 self.query(''' 237 CREATE TABLE data( 238 dt_stat SMALLINT UNSIGNED NOT NULL, 239 dt_x SMALLINT NOT NULL, 240 dt_y SMALLINT NOT NULL, 241 dt_run SMALLINT UNSIGNED NOT NULL, 242 dt_tick BIGINT UNSIGNED NOT NULL, 243 dt_bin SMALLINT UNSIGNED NOT NULL, 244 dt_data DOUBLE NOT NULL, 245 INDEX (dt_stat), 246 INDEX (dt_run), 247 UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin) 248 ) TYPE=InnoDB;''') 249 250 # 251 # Names and descriptions for multi-dimensional stats (vectors, etc.) 252 # are stored here instead of having their own entry in the statistics 253 # table. This allows all parts of a single stat to easily share a 254 # single id. 255 # 256 # COLUMNS: 257 # 'stat' is the unique stat identifier from the stat table. 258 # 'x' is the first dimension for multi-dimensional stats 259 # corresponding to the data table above. 260 # 'y' is the second dimension for multi-dimensional stats 261 # corresponding to the data table above. 262 # 'name' is the specific subname for the unique stat,x,y combination. 263 # 'descr' is the specific description for the uniqe stat,x,y 264 # combination. 265 # 266 # INDEXES: 267 # 'stat' is indexed so you can get the subdata for a specific stat. 268 # 269 self.query(''' 270 CREATE TABLE subdata( 271 sd_stat SMALLINT UNSIGNED NOT NULL, 272 sd_x SMALLINT NOT NULL, 273 sd_y SMALLINT NOT NULL, 274 sd_name VARCHAR(255) NOT NULL, 275 sd_descr TEXT, 276 UNIQUE (sd_stat,sd_x,sd_y) 277 ) TYPE=InnoDB''') 278 279 280 # 281 # The formula table is maintained separately from the data table 282 # because formula data, unlike other stat data cannot be represented 283 # there. 284 # 285 # COLUMNS: 286 # 'stat' refers to the stat field generated in the stat table. 287 # 'formula' is the actual string representation of the formula 288 # itself. 289 # 290 # INDEXES: 291 # 'stat' is indexed so that you can just look up a formula. 292 # 293 self.query(''' 294 CREATE TABLE formulas( 295 fm_stat SMALLINT UNSIGNED NOT NULL, 296 fm_formula BLOB NOT NULL, 297 PRIMARY KEY(fm_stat) 298 ) TYPE=InnoDB''') 299 300 # 301 # Each stat used in each formula is kept in this table. This way, if 302 # you want to print out a particular formula, you can simply find out 303 # which stats you need by looking in this table. Additionally, when 304 # you remove a stat from the stats table and data table, you remove 305 # any references to the formula in this table. When a formula is no 306 # longer referred to, you remove its entry. 307 # 308 # COLUMNS: 309 # 'stat' is the stat id from the stat table above. 310 # 'child' is the stat id of a stat that is used for this formula. 311 # There may be many children for any given 'stat' (formula) 312 # 313 # INDEXES: 314 # 'stat' is indexed so you can look up all of the children for a 315 # particular stat. 316 # 'child' is indexed so that you can remove an entry when a stat is 317 # removed. 318 # 319 self.query(''' 320 CREATE TABLE formula_ref( 321 fr_stat SMALLINT UNSIGNED NOT NULL, 322 fr_run SMALLINT UNSIGNED NOT NULL, 323 UNIQUE (fr_stat,fr_run), 324 INDEX (fr_stat), 325 INDEX (fr_run) 326 ) TYPE=InnoDB''') 327 328 # COLUMNS: 329 # 'event' is the unique event id from the event_desc table 330 # 'run' is simulation run id that this event took place in 331 # 'tick' is the tick when the event happened 332 # 333 # INDEXES: 334 # 'event' is indexed so you can look up all occurences of a 335 # specific event 336 # 'run' is indexed so you can find all events in a run 337 # 'tick' is indexed because we want the unique thing anyway 338 # 'event,run,tick' is unique combination 339 self.query(''' 340 CREATE TABLE events( 341 ev_event SMALLINT UNSIGNED NOT NULL, 342 ev_run SMALLINT UNSIGNED NOT NULL, 343 ev_tick BIGINT UNSIGNED NOT NULL, 344 INDEX(ev_event), 345 INDEX(ev_run), 346 INDEX(ev_tick), 347 UNIQUE(ev_event,ev_run,ev_tick) 348 ) TYPE=InnoDB''') 349 350 # COLUMNS: 351 # 'id' is the unique description id 352 # 'name' is the name of the event that occurred 353 # 354 # INDEXES: 355 # 'id' is indexed because it is the primary key and is what you use 356 # to look up the descriptions 357 # 'name' is indexed so one can find the event based on name 358 # 359 self.query(''' 360 CREATE TABLE event_names( 361 en_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 362 en_name VARCHAR(255) NOT NULL, 363 PRIMARY KEY (en_id), 364 UNIQUE (en_name) 365 ) TYPE=InnoDB''') 366 367 def clean(self): 368 self.query(''' 369 DELETE data 370 FROM data 371 LEFT JOIN runs ON dt_run=rn_id 372 WHERE rn_id IS NULL''') 373 374 self.query(''' 375 DELETE formula_ref 376 FROM formula_ref 377 LEFT JOIN runs ON fr_run=rn_id 378 WHERE rn_id IS NULL''') 379 380 self.query(''' 381 DELETE formulas 382 FROM formulas 383 LEFT JOIN formula_ref ON fm_stat=fr_stat 384 WHERE fr_stat IS NULL''') 385 386 self.query(''' 387 DELETE stats 388 FROM stats 389 LEFT JOIN data ON st_id=dt_stat 390 WHERE dt_stat IS NULL''') 391 392 self.query(''' 393 DELETE subdata 394 FROM subdata 395 LEFT JOIN data ON sd_stat=dt_stat 396 WHERE dt_stat IS NULL''') 397 398 self.query(''' 399 DELETE bins 400 FROM bins 401 LEFT JOIN data ON bn_id=dt_bin 402 WHERE dt_bin IS NULL''') 403 404 self.query(''' 405 DELETE events 406 FROM events 407 LEFT JOIN runs ON ev_run=rn_id 408 WHERE rn_id IS NULL''') 409 410 self.query(''' 411 DELETE event_names 412 FROM event_names 413 LEFT JOIN events ON en_id=ev_event 414 WHERE ev_event IS NULL''') 415