dbinit.py revision 1049
1import MySQLdb 2 3class MyDB(object): 4 def __init__(self, options): 5 self.name = options.db 6 self.host = options.host 7 self.user = options.user 8 self.passwd = options.passwd 9 self.mydb = None 10 self.cursor = None 11 12 def admin(self): 13 self.close() 14 self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user, 15 passwd=self.passwd) 16 self.cursor = self.mydb.cursor() 17 18 def connect(self): 19 self.close() 20 self.mydb = MySQLdb.connect(db=self.name, host=self.host, 21 user=self.user, passwd=self.passwd) 22 self.cursor = self.mydb.cursor() 23 24 def close(self): 25 if self.mydb is not None: 26 self.mydb.close() 27 self.cursor = None 28 29 def query(self, sql): 30 self.cursor.execute(sql) 31 32 def drop(self): 33 self.query('DROP DATABASE IF EXISTS %s' % self.name) 34 35 def create(self): 36 self.query('CREATE DATABASE %s' % self.name) 37 38 def populate(self): 39 # 40 # Each run (or simulation) gets its own entry in the runs table to 41 # group stats by where they were generated 42 # 43 # COLUMNS: 44 # 'id' is a unique identifier for each run to be used in other 45 # tables. 46 # 'name' is the user designated name for the data generated. It is 47 # configured in the simulator. 48 # 'user' identifies the user that generated the data for the given 49 # run. 50 # 'project' another name to identify runs for a specific goal 51 # 'date' is a timestamp for when the data was generated. It can be 52 # used to easily expire data that was generated in the past. 53 # 'expire' is a timestamp for when the data should be removed from 54 # the database so we don't have years worth of junk. 55 # 56 # INDEXES: 57 # 'run' is indexed so you can find out details of a run if the run 58 # was retreived from the data table. 59 # 'name' is indexed so that two all run names are forced to be unique 60 # 61 self.query(''' 62 CREATE TABLE runs( 63 rn_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 64 rn_name VARCHAR(200) NOT NULL, 65 rn_sample VARCHAR(32) NOT NULL, 66 rn_user VARCHAR(32) NOT NULL, 67 rn_project VARCHAR(100) NOT NULL, 68 rn_date TIMESTAMP NOT NULL, 69 rn_expire TIMESTAMP NOT NULL, 70 PRIMARY KEY (rn_id), 71 UNIQUE (rn_name,rn_sample) 72 ) TYPE=InnoDB''') 73 74 # 75 # We keep the bin names separate so that the data table doesn't get 76 # huge since bin names are frequently repeated. 77 # 78 # COLUMNS: 79 # 'id' is the unique bin identifer. 80 # 'name' is the string name for the bin. 81 # 82 # INDEXES: 83 # 'bin' is indexed to get the name of a bin when data is retrieved 84 # via the data table. 85 # 'name' is indexed to get the bin id for a named bin when you want 86 # to search the data table based on a specific bin. 87 # 88 self.query(''' 89 CREATE TABLE bins( 90 bn_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 91 bn_name VARCHAR(255) NOT NULL, 92 PRIMARY KEY(bn_id), 93 UNIQUE (bn_name) 94 ) TYPE=InnoDB''') 95 96 # 97 # The stat table gives us all of the data for a particular stat. 98 # 99 # COLUMNS: 100 # 'stat' is a unique identifier for each stat to be used in other 101 # tables for references. 102 # 'name' is simply the simulator derived name for a given 103 # statistic. 104 # 'descr' is the description of the statistic and what it tells 105 # you. 106 # 'type' defines what the stat tells you. Types are: 107 # SCALAR: A simple scalar statistic that holds one value 108 # VECTOR: An array of statistic values. Such a something that 109 # is generated per-thread. Vectors exist to give averages, 110 # pdfs, cdfs, means, standard deviations, etc across the 111 # stat values. 112 # DIST: Is a distribution of data. When the statistic value is 113 # sampled, its value is counted in a particular bucket. 114 # Useful for keeping track of utilization of a resource. 115 # (e.g. fraction of time it is 25% used vs. 50% vs. 100%) 116 # VECTORDIST: Can be used when the distribution needs to be 117 # factored out into a per-thread distribution of data for 118 # example. It can still be summed across threads to find 119 # the total distribution. 120 # VECTOR2D: Can be used when you have a stat that is not only 121 # per-thread, but it is per-something else. Like 122 # per-message type. 123 # FORMULA: This statistic is a formula, and its data must be 124 # looked up in the formula table, for indicating how to 125 # present its values. 126 # 'subdata' is potentially used by any of the vector types to 127 # give a specific name to all of the data elements within a 128 # stat. 129 # 'print' indicates whether this stat should be printed ever. 130 # (Unnamed stats don't usually get printed) 131 # 'prereq' only print the stat if the prereq is not zero. 132 # 'prec' number of decimal places to print 133 # 'nozero' don't print zero values 134 # 'nonan' don't print NaN values 135 # 'total' for vector type stats, print the total. 136 # 'pdf' for vector type stats, print the pdf. 137 # 'cdf' for vector type stats, print the cdf. 138 # 139 # The Following are for dist type stats: 140 # 'min' is the minimum bucket value. Anything less is an underflow. 141 # 'max' is the maximum bucket value. Anything more is an overflow. 142 # 'bktsize' is the approximate number of entries in each bucket. 143 # 'size' is the number of buckets. equal to (min/max)/bktsize. 144 # 145 # INDEXES: 146 # 'stat' is indexed so that you can find out details about a stat 147 # if the stat id was retrieved from the data table. 148 # 'name' is indexed so that you can simply look up data about a 149 # named stat. 150 # 151 self.query(''' 152 CREATE TABLE stats( 153 st_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 154 st_name VARCHAR(255) NOT NULL, 155 st_descr TEXT NOT NULL, 156 st_type ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST", 157 "VECTOR2D", "FORMULA") NOT NULL, 158 st_print BOOL NOT NULL, 159 st_prereq SMALLINT UNSIGNED NOT NULL, 160 st_prec TINYINT NOT NULL, 161 st_nozero BOOL NOT NULL, 162 st_nonan BOOL NOT NULL, 163 st_total BOOL NOT NULL, 164 st_pdf BOOL NOT NULL, 165 st_cdf BOOL NOT NULL, 166 st_min DOUBLE NOT NULL, 167 st_max DOUBLE NOT NULL, 168 st_bktsize DOUBLE NOT NULL, 169 st_size SMALLINT UNSIGNED NOT NULL, 170 PRIMARY KEY (st_id), 171 UNIQUE (st_name) 172 ) TYPE=InnoDB''') 173 174 # 175 # This is the main table of data for stats. 176 # 177 # COLUMNS: 178 # 'stat' refers to the stat field given in the stat table. 179 # 180 # 'x' referrs to the first dimension of a multi-dimensional stat. For 181 # a vector, x will start at 0 and increase for each vector 182 # element. 183 # For a distribution: 184 # -1: sum (for calculating standard deviation) 185 # -2: sum of squares (for calculating standard deviation) 186 # -3: total number of samples taken (for calculating 187 # standard deviation) 188 # -4: minimum value 189 # -5: maximum value 190 # -6: underflow 191 # -7: overflow 192 # 'y' is used by a VECTORDIST and the VECTOR2D to describe the second 193 # dimension. 194 # 'run' is the run that the data was generated from. Details up in 195 # the run table 196 # 'tick' is a timestamp generated by the simulator. 197 # 'bin' is the name of the bin that the data was generated in, if 198 # any. 199 # 'data' is the actual stat value. 200 # 201 # INDEXES: 202 # 'stat' is indexed so that a user can find all of the data for a 203 # particular stat. It is not unique, because that specific stat 204 # can be found in many runs, bins, and samples, in addition to 205 # having entries for the mulidimensional cases. 206 # 'run' is indexed to allow a user to remove all of the data for a 207 # particular execution run. It can also be used to allow the 208 # user to print out all of the data for a given run. 209 # 210 self.query(''' 211 CREATE TABLE data( 212 dt_stat SMALLINT UNSIGNED NOT NULL, 213 dt_x SMALLINT NOT NULL, 214 dt_y SMALLINT NOT NULL, 215 dt_run SMALLINT UNSIGNED NOT NULL, 216 dt_tick BIGINT UNSIGNED NOT NULL, 217 dt_bin SMALLINT UNSIGNED NOT NULL, 218 dt_data DOUBLE NOT NULL, 219 INDEX (dt_stat), 220 INDEX (dt_run), 221 UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin) 222 ) TYPE=InnoDB;''') 223 224 # 225 # Names and descriptions for multi-dimensional stats (vectors, etc.) 226 # are stored here instead of having their own entry in the statistics 227 # table. This allows all parts of a single stat to easily share a 228 # single id. 229 # 230 # COLUMNS: 231 # 'stat' is the unique stat identifier from the stat table. 232 # 'x' is the first dimension for multi-dimensional stats 233 # corresponding to the data table above. 234 # 'y' is the second dimension for multi-dimensional stats 235 # corresponding to the data table above. 236 # 'name' is the specific subname for the unique stat,x,y combination. 237 # 'descr' is the specific description for the uniqe stat,x,y 238 # combination. 239 # 240 # INDEXES: 241 # 'stat' is indexed so you can get the subdata for a specific stat. 242 # 243 self.query(''' 244 CREATE TABLE subdata( 245 sd_stat SMALLINT UNSIGNED NOT NULL, 246 sd_x SMALLINT NOT NULL, 247 sd_y SMALLINT NOT NULL, 248 sd_name VARCHAR(255) NOT NULL, 249 sd_descr TEXT, 250 UNIQUE (sd_stat,sd_x,sd_y) 251 ) TYPE=InnoDB''') 252 253 254 # 255 # The formula table is maintained separately from the data table 256 # because formula data, unlike other stat data cannot be represented 257 # there. 258 # 259 # COLUMNS: 260 # 'stat' refers to the stat field generated in the stat table. 261 # 'formula' is the actual string representation of the formula 262 # itself. 263 # 264 # INDEXES: 265 # 'stat' is indexed so that you can just look up a formula. 266 # 267 self.query(''' 268 CREATE TABLE formulas( 269 fm_stat SMALLINT UNSIGNED NOT NULL, 270 fm_formula BLOB NOT NULL, 271 PRIMARY KEY(fm_stat) 272 ) TYPE=InnoDB''') 273 274 # 275 # Each stat used in each formula is kept in this table. This way, if 276 # you want to print out a particular formula, you can simply find out 277 # which stats you need by looking in this table. Additionally, when 278 # you remove a stat from the stats table and data table, you remove 279 # any references to the formula in this table. When a formula is no 280 # longer referred to, you remove its entry. 281 # 282 # COLUMNS: 283 # 'stat' is the stat id from the stat table above. 284 # 'child' is the stat id of a stat that is used for this formula. 285 # There may be many children for any given 'stat' (formula) 286 # 287 # INDEXES: 288 # 'stat' is indexed so you can look up all of the children for a 289 # particular stat. 290 # 'child' is indexed so that you can remove an entry when a stat is 291 # removed. 292 # 293 self.query(''' 294 CREATE TABLE formula_ref( 295 fr_stat SMALLINT UNSIGNED NOT NULL, 296 fr_run SMALLINT UNSIGNED NOT NULL, 297 UNIQUE (fr_stat,fr_run), 298 INDEX (fr_stat), 299 INDEX (fr_run) 300 ) TYPE=InnoDB''') 301 302 # COLUMNS: 303 # 'event' is the unique event id from the event_desc table 304 # 'run' is simulation run id that this event took place in 305 # 'tick' is the tick when the event happened 306 # 307 # INDEXES: 308 # 'event' is indexed so you can look up all occurences of a 309 # specific event 310 # 'run' is indexed so you can find all events in a run 311 # 'tick' is indexed because we want the unique thing anyway 312 # 'event,run,tick' is unique combination 313 self.query(''' 314 CREATE TABLE events( 315 ev_event SMALLINT UNSIGNED NOT NULL, 316 ev_run SMALLINT UNSIGNED NOT NULL, 317 ev_tick BIGINT UNSIGNED NOT NULL, 318 INDEX(ev_event), 319 INDEX(ev_run), 320 INDEX(ev_tick), 321 UNIQUE(ev_event,ev_run,ev_tick) 322 ) TYPE=InnoDB''') 323 324 # COLUMNS: 325 # 'id' is the unique description id 326 # 'name' is the name of the event that occurred 327 # 328 # INDEXES: 329 # 'id' is indexed because it is the primary key and is what you use 330 # to look up the descriptions 331 # 'name' is indexed so one can find the event based on name 332 # 333 self.query(''' 334 CREATE TABLE event_names( 335 en_id SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, 336 en_name VARCHAR(255) NOT NULL, 337 PRIMARY KEY (en_id), 338 UNIQUE (en_name) 339 ) TYPE=InnoDB''') 340 341 def clean(self): 342 self.query(''' 343 DELETE data 344 FROM data 345 LEFT JOIN runs ON dt_run=rn_id 346 WHERE rn_id IS NULL''') 347 348 self.query(''' 349 DELETE formula_ref 350 FROM formula_ref 351 LEFT JOIN runs ON fr_run=rn_id 352 WHERE rn_id IS NULL''') 353 354 self.query(''' 355 DELETE formulas 356 FROM formulas 357 LEFT JOIN formula_ref ON fm_stat=fr_stat 358 WHERE fr_stat IS NULL''') 359 360 self.query(''' 361 DELETE stats 362 FROM stats 363 LEFT JOIN data ON st_id=dt_stat 364 WHERE dt_stat IS NULL''') 365 366 self.query(''' 367 DELETE subdata 368 FROM subdata 369 LEFT JOIN data ON sd_stat=dt_stat 370 WHERE dt_stat IS NULL''') 371 372 self.query(''' 373 DELETE bins 374 FROM bins 375 LEFT JOIN data ON bn_id=dt_bin 376 WHERE dt_bin IS NULL''') 377 378 self.query(''' 379 DELETE events 380 FROM events 381 LEFT JOIN runs ON ev_run=rn_id 382 WHERE rn_id IS NULL''') 383 384 self.query(''' 385 DELETE event_names 386 FROM event_names 387 LEFT JOIN events ON en_id=ev_event 388 WHERE ev_event IS NULL''') 389