BmnRoot
Loading...
Searching...
No Matches
file_size.py
Go to the documentation of this file.
1import argparse
2import matplotlib.pyplot as plt
3import numpy as np
4import os
5import psycopg2
6import re
7import uproot
8
9import file_size.config as config
10
11from exceptions import NoDataException
12
13
class SizeStatComputer:
    """Computes file-size statistics (total and per-event) for data files,
    cross-checking event counts against the UniConDa database."""

    def __init__(self, config_dict):
        """Configure the computer from a settings dictionary.

        Recognized keys: extensions, db_user, db_pass, db_name, db_host,
        dpi, folders_ignore, file_size_limit, event_size_limit, source.
        Missing keys fall back to defaults from file_size.config.
        """
        self.EXTENSIONS = config_dict.get('extensions')
        self.DB_USER = config_dict.get('db_user')
        self.DB_PASS = config_dict.get('db_pass')
        self.DB_NAME = config_dict.get('db_name')
        self.DB_HOST = config_dict.get('db_host')

        self.DPI = config_dict.get('dpi', config.DPI)
        # Copy the module-level default list: the original aliased
        # config.FOLDERS_IGNORE and then extended it in place, permanently
        # mutating the shared default (and, when 'folders_ignore' was absent,
        # extending the list with itself, duplicating every entry).
        self.FOLDERS_IGNORE = list(config.FOLDERS_IGNORE)
        self.FOLDERS_IGNORE.extend(config_dict.get('folders_ignore', []))

        # Each limit is a (low, high) pair in bytes, or (None, None) when unset.
        self.FILE_SIZE_LIMIT_LOW, self.FILE_SIZE_LIMIT_HIGH = \
            self.extract_size_limits(config_dict.get('file_size_limit'))
        self.EVENT_SIZE_LIMIT_LOW, self.EVENT_SIZE_LIMIT_HIGH = \
            self.extract_size_limits(config_dict.get('event_size_limit'))

        self.SOURCE = (config_dict.get('source') or "exp").lower()  # "exp" or "sim"
31
32
33 def extract_size_limits(self, limit_str):
34 if limit_str is None or limit_str == "":
35 return None, None
36 units = {"": 1, "B": 1, "KB": 1024, "MB": 1024*1024, "GB": 1024*1024*1024, "TB": 1024*1024*1024*1024}
37 try:
38 res = re.search("(\d+)(\w*)\s*:\s*(\d+)(\w*)", limit_str.strip())
39 gr = res.groups()
40 if len(gr) != 4:
41 raise Exception("Wrong string specifying file size limits")
42 limit_min = int(gr[0])
43 units_min = gr[1].upper()
44 limit_max = int(gr[2])
45 units_max = gr[3].upper()
46 if units_min not in units or units_max not in units:
47 raise Exception("Wrong string specifying file size limits")
48 return limit_min * units[units_min], limit_max * units[units_max]
49 except:
50 print(f"\n\nWarning: Could not extract size limits from {limit_str}")
51 return None, None
52
53
54 def compute(self, _dir, recursive):
55 arr, arr_per_event = self.parse_dirparse_dir(_dir, recursive)
56
57 arr, unit = self.convert_unitsconvert_units(arr)
58 title = f"File size, {unit}. Mean = {np.mean(arr):.3f} {unit}. Overall {len(arr)} files."
59
60 print()
61 print("Obtained characteristics:")
62 print(f" File statistics: min = {np.min(arr):.3f} {unit}, avg = {np.mean(arr):.3f} {unit}, " \
63 f"max={np.max(arr):.3f} {unit}, summary={np.sum(arr):.3f} {unit}")
64
65 arr_per_event, unit_per_event = self.convert_unitsconvert_units(arr_per_event)
66 title_per_event = f"File size per event, {unit_per_event}. Mean = {np.mean(arr_per_event):.3f} {unit_per_event}. " \
67 f"Overall {len(arr_per_event)} files."
68
69 if len(arr_per_event) == 0:
70 print("There is no additional statistics on individual events")
71 else:
72 print(f" File statistics per event: min = {np.min(arr_per_event):.3f} {unit_per_event}, " +
73 f"avg = {np.mean(arr_per_event):.3f} {unit_per_event}, max={np.max(arr_per_event):.3f} {unit_per_event}")
74
75 return (arr, unit, title, arr_per_event, unit_per_event, title_per_event)
76
77
78 def parse_dir(self, _dir, recursive):
79 filesize_arr = []
80 filesize_per_event = []
81
82 if recursive:
83 files_to_walk = os.walk(_dir)
84 else:
85 files_to_walk = [next(os.walk(_dir))]
86
87 files_parsed_successful = 0
88 files_parsed_overall = 0
89 unsuccessful_list = []
90 for root, dirs, files in files_to_walk:
91 for file in files:
92 if self.is_file_to_parseis_file_to_parse(root, file):
93 files_parsed_overall += 1
94 file_path = os.path.join(root, file)
95 filesize_bytes = os.stat(file_path).st_size
96 if self.FILE_SIZE_LIMIT_LOW is not None and self.FILE_SIZE_LIMIT_HIGH is not None:
97 if filesize_bytes < self.FILE_SIZE_LIMIT_LOW or filesize_bytes > self.FILE_SIZE_LIMIT_HIGH:
98 filesize_conv, filesize_units = self.convert_units_scalarconvert_units_scalar(filesize_bytes)
99 print(f"\nFile {file_path} is {filesize_conv:.1f} {filesize_units} "\
100 f"which does not meet file size limit - skipping.")
101 unsuccessful_list.append(file_path)
102 continue
103
104 if self.SOURCE == "exp":
105 # CALCULATE EVENT SIZE FOR EXTRACTED RUN NUMBER
106 #run_number = re.search(config.RUN_NUM_REGEX, file)
107 #if run_number is None:
108 # print(f"\nNo run number found in filename name for experimental file {file_path}")
109 # unsuccessful_list.append(file_path)
110 # event_count = None
111 #else:
112 # run_number = run_number.group()
113 # event_count = self.get_event_count_by_run(run_number)
114 # CALCULATE EVENT SIZE FOR RAW FILENAME
115 event_count = self.get_event_count_rawget_event_count_raw(file_path)
116 else: # "sim"
117 event_count = self.get_event_count_simget_event_count_sim(file_path)
118 # print(f"Got {run_count} events in simulation file {file_path}")
119
120 if event_count is None:
121 print(f"\nNo event count found in the database for file {file_path}")
122 unsuccessful_list.append(file_path)
123 #continue
124
125 file_ext = os.path.splitext(file_path)[1]
126 if file_ext == ".root":
127 uproot_count = self.uproot_event_countuproot_event_count(file_path)
128 # if None, file is probably not a root file
129 if uproot_count != None and event_count != None and uproot_count != event_count:
130 print(f"\nFile {file_path} has {uproot_count} events but the database reports {event_count} events - skipping...")
131 unsuccessful_list.append(file_path)
132 continue
133
134 if (event_count is not None) and (event_count != 0):
135 filesize_bytes_per_event = filesize_bytes / event_count
136 if self.EVENT_SIZE_LIMIT_LOW is not None and self.EVENT_SIZE_LIMIT_HIGH is not None:
137 if filesize_bytes_per_event < self.EVENT_SIZE_LIMIT_LOW or filesize_bytes_per_event > self.EVENT_SIZE_LIMIT_HIGH:
138 eventsize_conv, eventsize_units = self.convert_units_scalarconvert_units_scalar(filesize_bytes_per_event)
139 print(f"\nFile {file_path} has {eventsize_conv:.1f} {eventsize_units} per event "\
140 f"which does not meet event size limit - skipping.")
141 unsuccessful_list.append(file_path)
142 continue
143 filesize_per_event.append(filesize_bytes_per_event)
144 files_parsed_successful += 1
145 print("+", end="", flush=True)
146 filesize_arr.append(filesize_bytes)
147
148 print()
149 print(f"Total files parsed: {files_parsed_successful}")
150 if filesize_arr == []:
151 raise NoDataException
152 unsuccessful_list.sort()
153 if len(unsuccessful_list) == 0:
154 print("\nAll files processed successfully.\n")
155 else:
156 print("\nUnsuccessfully processed files:")
157 for elem in unsuccessful_list:
158 print(elem)
159 if config.UNSUCCESSFUL_LOG_FILE is not None:
160 print()
161 with open(config.UNSUCCESSFUL_LOG_FILE, "wt") as f:
162 for elem in unsuccessful_list:
163 f.write(elem + "\n")
164 print(f"Unsuccessfully processed files list ({len(unsuccessful_list)}/{files_parsed_overall}, {(100*len(unsuccessful_list)/files_parsed_overall):.1f}%)"\
165 f" was saved to {config.UNSUCCESSFUL_LOG_FILE}\n")
166 return np.array(filesize_arr), np.array(filesize_per_event)
167
168
169 def is_file_to_parse(self, root, file):
170 # filepath = os.path.join(root, file)
171 correct_ext = any([file.endswith(ext) for ext in self.EXTENSIONS]) or ("*" in self.EXTENSIONS)
172 correct_folder = all([elem not in os.path.join(root, file) for elem in self.FOLDERS_IGNORE])
173 return correct_ext and correct_folder
174
175
176 def convert_units(self, arr):
177 mean = np.mean(arr)
178 for i, unit in enumerate(config.UNITS):
179 if mean / config.SIZE**i < config.SIZE:
180 break
181 arr = arr / (config.SIZE**i)
182 return arr, unit
183
184
185 def convert_units_scalar(self, num):
186 for i, unit in enumerate(config.UNITS):
187 if num / config.SIZE**i < config.SIZE:
188 break
189 res = num / (config.SIZE**i)
190 return res, unit
191
192
193 def get_event_count_by_run(self, run_num):
194 conn = psycopg2.connect(dbname=self.DB_NAME, user=self.DB_USER,
195 password=self.DB_PASS, host=self.DB_HOST)
196 cursor = conn.cursor()
197 cursor.execute(f'SELECT event_count FROM run_ WHERE run_number = {run_num}')
198 count = cursor.fetchone()
199 if count is None:
200 return None
201 count = count[0]
202 cursor.close()
203 conn.close()
204 return count
205
206 def get_event_count_raw(self, file_path):
207 conn = psycopg2.connect(dbname=self.DB_NAME, user=self.DB_USER,
208 password=self.DB_PASS, host=self.DB_HOST)
209 cursor = conn.cursor()
210 raw_file_name = os.path.basename(file_path)
211 #cursor.execute(f"SELECT event_count FROM raw_file WHERE regexp_replace(file_path, '^.+[/\\\\]', '') = '{raw_file_name}'")
212 cursor.execute(f"SELECT event_count FROM raw_file WHERE file_path LIKE '%/'||'{raw_file_name}' OR file_path = '{raw_file_name}'")
213 rows = cursor.fetchall()
214
215 count = None
216 if len(rows) == 0:
217 1 #print(f"\nUniConDa: no record found for filename: {raw_file_name}")
218 elif len(rows) == 1:
219 count = rows[0][0] # print("Exactly one match:", count)
220 else:
221 print(f"\nUniConDa: multiple records ({len(rows)}) found for filename: {raw_file_name}")
222 cursor.close()
223 conn.close()
224 return count
225
226 def get_event_count_sim(self, file_path):
227 conn = psycopg2.connect(dbname=self.DB_NAME, user=self.DB_USER,
228 password=self.DB_PASS, host=self.DB_HOST)
229 cursor = conn.cursor()
230 cursor.execute(f"SELECT event_count FROM simulation_file WHERE file_path='{file_path}'")
231 count = cursor.fetchone()
232 if count is not None:
233 count = count[0]
234 cursor.close()
235 conn.close()
236 return count
237
238
239 def uproot_event_count(self, file_path):
240 try:
241 r = uproot.open(file_path)
242 except Exception:
243 print(f"\nError occured while opening ROOT file: {file_path}")
244 return None
245 bmndata = list(filter(lambda x: x.startswith("bmndata;"), r.keys()))
246 if len(bmndata) not in [1, 2]:
247 return None
248 second_num = sorted(list(map(lambda s: int(s[8:]), bmndata)))[-1]
249 return r['bmndata;' + str(second_num)].member('fEntries') - 1
uproot_event_count(self, file_path)
Definition file_size.py:239
is_file_to_parse(self, root, file)
Definition file_size.py:169
compute(self, _dir, recursive)
Definition file_size.py:54
extract_size_limits(self, limit_str)
Definition file_size.py:33
get_event_count_raw(self, file_path)
Definition file_size.py:206
get_event_count_sim(self, file_path)
Definition file_size.py:226
convert_units_scalar(self, num)
Definition file_size.py:185
__init__(self, config_dict)
Definition file_size.py:15
get_event_count_by_run(self, run_num)
Definition file_size.py:193
parse_dir(self, _dir, recursive)
Definition file_size.py:78