dbinit.py revision 1049
1import MySQLdb
2
3class MyDB(object):
4    def __init__(self, options):
5        self.name = options.db
6        self.host = options.host
7        self.user = options.user
8        self.passwd = options.passwd
9        self.mydb = None
10        self.cursor = None
11
12    def admin(self):
13        self.close()
14        self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
15                                    passwd=self.passwd)
16        self.cursor = self.mydb.cursor()
17
18    def connect(self):
19        self.close()
20        self.mydb = MySQLdb.connect(db=self.name, host=self.host,
21                                    user=self.user, passwd=self.passwd)
22        self.cursor = self.mydb.cursor()
23
24    def close(self):
25        if self.mydb is not None:
26            self.mydb.close()
27        self.cursor = None
28
29    def query(self, sql):
30        self.cursor.execute(sql)
31
32    def drop(self):
33        self.query('DROP DATABASE IF EXISTS %s' % self.name)
34
35    def create(self):
36        self.query('CREATE DATABASE %s' % self.name)
37
38    def populate(self):
39        #
40        # Each run (or simulation) gets its own entry in the runs table to
41        # group stats by where they were generated
42        #
43        # COLUMNS:
44        #   'id' is a unique identifier for each run to be used in other
45        #       tables.
46        #   'name' is the user designated name for the data generated.  It is
47        #       configured in the simulator.
48        #   'user' identifies the user that generated the data for the given
49        #       run.
50        #   'project' another name to identify runs for a specific goal
51        #   'date' is a timestamp for when the data was generated.  It can be
52        #       used to easily expire data that was generated in the past.
53        #   'expire' is a timestamp for when the data should be removed from
54        #       the database so we don't have years worth of junk.
55        #
56        # INDEXES:
57        #   'run' is indexed so you can find out details of a run if the run
58        #       was retreived from the data table.
59        #   'name' is indexed so that two all run names are forced to be unique
60        #
61        self.query('''
62        CREATE TABLE runs(
63            rn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
64            rn_name	VARCHAR(200)		NOT NULL,
65            rn_sample	VARCHAR(32)		NOT NULL,
66            rn_user	VARCHAR(32)		NOT NULL,
67            rn_project	VARCHAR(100)            NOT NULL,
68            rn_date	TIMESTAMP		NOT NULL,
69            rn_expire	TIMESTAMP               NOT NULL,
70            PRIMARY KEY (rn_id),
71            UNIQUE (rn_name,rn_sample)
72        ) TYPE=InnoDB''')
73
74        #
75        # We keep the bin names separate so that the data table doesn't get
76        # huge since bin names are frequently repeated.
77        #
78        # COLUMNS:
79        #   'id' is the unique bin identifer.
80        #   'name' is the string name for the bin.
81        #
82        # INDEXES:
83        #   'bin' is indexed to get the name of a bin when data is retrieved
84        #       via the data table.
85        #   'name' is indexed to get the bin id for a named bin when you want
86        #       to search the data table based on a specific bin.
87        #
88        self.query('''
89        CREATE TABLE bins(
90            bn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
91            bn_name	VARCHAR(255)		NOT NULL,
92            PRIMARY KEY(bn_id),
93            UNIQUE (bn_name)
94        ) TYPE=InnoDB''')
95
96        #
97        # The stat table gives us all of the data for a particular stat.
98        #
99        # COLUMNS:
100        #   'stat' is a unique identifier for each stat to be used in other
101        #       tables for references.
102        #   'name' is simply the simulator derived name for a given
103        #       statistic.
104        #   'descr' is the description of the statistic and what it tells
105        #       you.
106        #   'type' defines what the stat tells you.  Types are:
107        #       SCALAR: A simple scalar statistic that holds one value
108        #       VECTOR: An array of statistic values.  Such a something that
109        #           is generated per-thread.  Vectors exist to give averages,
110        #	     pdfs, cdfs, means, standard deviations, etc across the
111        #           stat values.
112        #       DIST: Is a distribution of data.  When the statistic value is
113        #	     sampled, its value is counted in a particular bucket.
114        #           Useful for keeping track of utilization of a resource.
115        #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
116        #       VECTORDIST: Can be used when the distribution needs to be
117        #	     factored out into a per-thread distribution of data for
118        #	     example.  It can still be summed across threads to find
119        #           the total distribution.
120        #       VECTOR2D: Can be used when you have a stat that is not only
121        #           per-thread, but it is per-something else.  Like
122        #           per-message type.
123        #       FORMULA: This statistic is a formula, and its data must be
124        #	     looked up in the formula table, for indicating how to
125        #           present its values.
126        #   'subdata' is potentially used by any of the vector types to
127        #       give a specific name to all of the data elements within a
128        #       stat.
129        #   'print' indicates whether this stat should be printed ever.
130        #       (Unnamed stats don't usually get printed)
131        #   'prereq' only print the stat if the prereq is not zero.
132        #   'prec' number of decimal places to print
133        #   'nozero' don't print zero values
134        #   'nonan' don't print NaN values
135        #   'total' for vector type stats, print the total.
136        #   'pdf' for vector type stats, print the pdf.
137        #   'cdf' for vector type stats, print the cdf.
138        #
139        #   The Following are for dist type stats:
140        #   'min' is the minimum bucket value. Anything less is an underflow.
141        #   'max' is the maximum bucket value. Anything more is an overflow.
142        #   'bktsize' is the approximate number of entries in each bucket.
143        #   'size' is the number of buckets. equal to (min/max)/bktsize.
144        #
145        # INDEXES:
146        #   'stat' is indexed so that you can find out details about a stat
147        #       if the stat id was retrieved from the data table.
148        #   'name' is indexed so that you can simply look up data about a
149        #       named stat.
150        #
151        self.query('''
152        CREATE TABLE stats(
153            st_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
154            st_name	VARCHAR(255)		NOT NULL,
155            st_descr	TEXT			NOT NULL,
156            st_type	ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
157                "VECTOR2D", "FORMULA")	NOT NULL,
158            st_print	BOOL			NOT NULL,
159            st_prereq	SMALLINT UNSIGNED	NOT NULL,
160            st_prec	TINYINT			NOT NULL,
161            st_nozero	BOOL			NOT NULL,
162            st_nonan	BOOL			NOT NULL,
163            st_total	BOOL			NOT NULL,
164            st_pdf	BOOL			NOT NULL,
165            st_cdf	BOOL			NOT NULL,
166            st_min	DOUBLE			NOT NULL,
167            st_max	DOUBLE			NOT NULL,
168            st_bktsize	DOUBLE			NOT NULL,
169            st_size	SMALLINT UNSIGNED	NOT NULL,
170            PRIMARY KEY (st_id),
171            UNIQUE (st_name)
172        ) TYPE=InnoDB''')
173
174        #
175        # This is the main table of data for stats.
176        #
177        # COLUMNS:
178        #   'stat' refers to the stat field given in the stat table.
179        #
180        #   'x' referrs to the first dimension of a multi-dimensional stat. For
181        #       a vector, x will start at 0 and increase for each vector
182        #       element.
183        #       For a distribution:
184        #       -1: sum (for calculating standard deviation)
185        #       -2: sum of squares (for calculating standard deviation)
186        #       -3: total number of samples taken (for calculating
187        #           standard deviation)
188        #       -4: minimum value
189        #       -5: maximum value
190        #       -6: underflow
191        #       -7: overflow
192        #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
193        #       dimension.
194        #   'run' is the run that the data was generated from.  Details up in
195        #       the run table
196        #   'tick' is a timestamp generated by the simulator.
197        #   'bin' is the name of the bin that the data was generated in, if
198        #       any.
199        #   'data' is the actual stat value.
200        #
201        # INDEXES:
202        #   'stat' is indexed so that a user can find all of the data for a
203        #       particular stat. It is not unique, because that specific stat
204        #       can be found in many runs, bins, and samples, in addition to
205        #       having entries for the mulidimensional cases.
206        #   'run' is indexed to allow a user to remove all of the data for a
207        #       particular execution run.  It can also be used to allow the
208        #       user to print out all of the data for a given run.
209        #
210        self.query('''
211        CREATE TABLE data(
212            dt_stat	SMALLINT UNSIGNED	NOT NULL,
213            dt_x	SMALLINT		NOT NULL,
214            dt_y	SMALLINT		NOT NULL,
215            dt_run	SMALLINT UNSIGNED	NOT NULL,
216            dt_tick	BIGINT UNSIGNED		NOT NULL,
217            dt_bin	SMALLINT UNSIGNED	NOT NULL,
218            dt_data	DOUBLE			NOT NULL,
219            INDEX (dt_stat),
220            INDEX (dt_run),
221            UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
222        ) TYPE=InnoDB;''')
223
224        #
225        # Names and descriptions for multi-dimensional stats (vectors, etc.)
226        # are stored here instead of having their own entry in the statistics
227        # table. This allows all parts of a single stat to easily share a
228        # single id.
229        #
230        # COLUMNS:
231        #   'stat' is the unique stat identifier from the stat table.
232        #   'x' is the first dimension for multi-dimensional stats
233        #       corresponding to the data table above.
234        #   'y' is the second dimension for multi-dimensional stats
235        #       corresponding to the data table above.
236        #   'name' is the specific subname for the unique stat,x,y combination.
237        #   'descr' is the specific description for the uniqe stat,x,y
238        #        combination.
239        #
240        # INDEXES:
241        #   'stat' is indexed so you can get the subdata for a specific stat.
242        #
243        self.query('''
244        CREATE TABLE subdata(
245            sd_stat	SMALLINT UNSIGNED	NOT NULL,
246            sd_x	SMALLINT		NOT NULL,
247            sd_y	SMALLINT		NOT NULL,
248            sd_name	VARCHAR(255)		NOT NULL,
249            sd_descr	TEXT,
250            UNIQUE (sd_stat,sd_x,sd_y)
251        ) TYPE=InnoDB''')
252
253
254        #
255        # The formula table is maintained separately from the data table
256        # because formula data, unlike other stat data cannot be represented
257        # there.
258        #
259        # COLUMNS:
260        #   'stat' refers to the stat field generated in the stat table.
261        #   'formula' is the actual string representation of the formula
262        #       itself.
263        #
264        # INDEXES:
265        #   'stat' is indexed so that you can just look up a formula.
266        #
267        self.query('''
268        CREATE TABLE formulas(
269            fm_stat	SMALLINT UNSIGNED	NOT NULL,
270            fm_formula	BLOB			NOT NULL,
271            PRIMARY KEY(fm_stat)
272        ) TYPE=InnoDB''')
273
274        #
275        # Each stat used in each formula is kept in this table.  This way, if
276        # you want to print out a particular formula, you can simply find out
277        # which stats you need by looking in this table.  Additionally, when
278        # you remove a stat from the stats table and data table, you remove
279        # any references to the formula in this table.  When a formula is no
280        # longer referred to, you remove its entry.
281        #
282        # COLUMNS:
283        #   'stat' is the stat id from the stat table above.
284        #   'child' is the stat id of a stat that is used for this formula.
285        #       There may be many children for any given 'stat' (formula)
286        #
287        # INDEXES:
288        #   'stat' is indexed so you can look up all of the children for a
289        #       particular stat.
290        #   'child' is indexed so that you can remove an entry when a stat is
291        #       removed.
292        #
293        self.query('''
294        CREATE TABLE formula_ref(
295            fr_stat	SMALLINT UNSIGNED	NOT NULL,
296            fr_run	SMALLINT UNSIGNED	NOT NULL,
297            UNIQUE (fr_stat,fr_run),
298            INDEX (fr_stat),
299            INDEX (fr_run)
300        ) TYPE=InnoDB''')
301
302        # COLUMNS:
303        #   'event' is the unique event id from the event_desc table
304        #   'run' is simulation run id that this event took place in
305        #   'tick' is the tick when the event happened
306        #
307        # INDEXES:
308        #   'event' is indexed so you can look up all occurences of a
309        #       specific event
310        #   'run' is indexed so you can find all events in a run
311        #   'tick' is indexed because we want the unique thing anyway
312        #   'event,run,tick' is unique combination
313        self.query('''
314        CREATE TABLE events(
315            ev_event	SMALLINT UNSIGNED	NOT NULL,
316            ev_run	SMALLINT UNSIGNED	NOT NULL,
317            ev_tick	BIGINT   UNSIGNED       NOT NULL,
318            INDEX(ev_event),
319            INDEX(ev_run),
320            INDEX(ev_tick),
321            UNIQUE(ev_event,ev_run,ev_tick)
322        ) TYPE=InnoDB''')
323
324        # COLUMNS:
325        #   'id' is the unique description id
326        #   'name' is the name of the event that occurred
327        #
328        # INDEXES:
329        #   'id' is indexed because it is the primary key and is what you use
330        #       to look up the descriptions
331        #   'name' is indexed so one can find the event based on name
332        #
333        self.query('''
334        CREATE TABLE event_names(
335            en_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
336            en_name	VARCHAR(255)		NOT NULL,
337            PRIMARY KEY (en_id),
338            UNIQUE (en_name)
339        ) TYPE=InnoDB''')
340
341    def clean(self):
342        self.query('''
343        DELETE data
344        FROM data
345        LEFT JOIN runs ON dt_run=rn_id
346        WHERE rn_id IS NULL''')
347
348        self.query('''
349        DELETE formula_ref
350        FROM formula_ref
351        LEFT JOIN runs ON fr_run=rn_id
352        WHERE rn_id IS NULL''')
353
354        self.query('''
355        DELETE formulas
356        FROM formulas
357        LEFT JOIN formula_ref ON fm_stat=fr_stat
358        WHERE fr_stat IS NULL''')
359
360        self.query('''
361        DELETE stats
362        FROM stats
363        LEFT JOIN data ON st_id=dt_stat
364        WHERE dt_stat IS NULL''')
365
366        self.query('''
367        DELETE subdata
368        FROM subdata
369        LEFT JOIN data ON sd_stat=dt_stat
370        WHERE dt_stat IS NULL''')
371
372        self.query('''
373        DELETE bins
374        FROM bins
375        LEFT JOIN data ON bn_id=dt_bin
376        WHERE dt_bin IS NULL''')
377
378        self.query('''
379        DELETE events
380        FROM events
381        LEFT JOIN runs ON ev_run=rn_id
382        WHERE rn_id IS NULL''')
383
384        self.query('''
385        DELETE event_names
386        FROM event_names
387        LEFT JOIN events ON en_id=ev_event
388        WHERE ev_event IS NULL''')
389