# Root directory of generated simulation (generator-level) files on EOS.
simulation_directory = "/eos/nica/bmn/sim/gen"
# Root directory of reconstructed (DST) simulation files on EOS.
dst_directory = "/eos/nica/bmn/sim/dst"

# NOTE(review): only a fragment of the name_to_generator mapping is visible in
# this view; it maps lowercase generator tokens found in file/directory names
# to canonical generator names.
        "dcm-qgsm": "DCMQGSM",

# Ordered list of canonical generator names; the order is used as a priority
# via generator_list.index(...) when directory and file names disagree.
generator_list = list(name_to_generator.values())
def getConfigFilePath():
    """Return the absolute path of the build's config.sh.

    The path is resolved relative to this script's real (symlink-free)
    location: <script_dir>/../../build/config.sh.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    candidate = script_dir + "/../../build/config.sh"
    return os.path.abspath(candidate)
def printProgress(iteration, total, prefix = 'Progress:', suffix = 'Complete', percent_view = 1):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration - Required : current iteration (Int)
        total - Required : total iterations (Int)
    """
    # NOTE(review): `length`, `fill` and `decimals` are not defined anywhere in
    # the visible part of this function; presumably additional parameters (or
    # globals) elided from this view -- confirm against the full file.
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Percentage form of the progress line ("\r" redraws in place).
    # NOTE(review): the branch selecting between the percent and the
    # "i of n" form (presumably on percent_view) is elided from this view.
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
    # Count form of the progress line: "<i> of <n>".
    progress_iter = ("{0} of {1}").format(iteration, total)
    sys.stdout.write('\r%s |%s| %s %s' % (prefix, bar, progress_iter, suffix))
def parse_name_and_write(filepath, file_type, file_name, separator, generator_type_dir, conn):
    """Parse simulation metadata out of a file name and insert a row into the
    simulation_file table.

    Extracted fields: generator name, beam/target particles, energy,
    centrality, event count (via the external `show_event_count` tool),
    file size and md5 hash.

    filepath           -- absolute path of the file being catalogued
    file_type          -- 0 for generator output; otherwise the file is
                          treated as a plain ROOT (DST) file
    file_name          -- file name without its extension
    separator          -- token separator in the file name ("_" or ".")
    generator_type_dir -- generator deduced from the directory name ("" if none)
    conn               -- open psycopg2 connection

    NOTE(review): many lines of this function are elided from this view
    (token_num bookkeeping, early returns, `try:` openers, guard `if`s);
    comments below flag the gaps rather than guess at the missing code.
    """
    file_size = os.path.getsize(filepath)
    logging.debug('file size: {0} MB'.format(file_size/1024.0/1024.0))
    # Emitted when the size check fails (the guard condition is elided here).
    logging.error("File size is wrong: {0}".format(filepath))
    file_tokens = file_name.split(separator)
    # Scan the current token for any known generator-name substring.
    # `token_num` is initialized in lines elided from this view.
    generator_type_file = ""
    for gen_name in name_to_generator:
        if gen_name in file_tokens[token_num].lower():
            generator_type_file = name_to_generator[gen_name]
    if generator_type_dir and generator_type_dir != generator_type_file:
        # Directory and file name disagree: keep the higher-priority one
        # (lower index wins).  NOTE(review): `keys_list` is not defined in
        # this view and differs from `generator_list` used for the other
        # operand -- looks inconsistent, confirm against the full file.
        if generator_list.index(generator_type_dir) < keys_list.index(generator_type_file):
            generator_type_file = generator_type_dir
        generator_type_dir = generator_type_file
    else: logging.debug('generator type in file: {0}'.format(generator_type_file))
    if token_num == len(file_tokens):
        logging.error('unexpected end of the file name after generator definition: {0}'.format(filepath))
    # Fall back to the directory-derived generator when the name gave none.
    if not generator_type_file: generator_type_file = generator_type_dir
    if not generator_type_file:
        logging.error('generator type was not set for file: {0}'.format(filepath))
    # Generator output is counted in its own format; DST files are 'root'.
    cur_file_type = generator_type_file if file_type == 0 else 'root'
    # Source the build config, then ask show_event_count for the event count.
    popen = subprocess.Popen(". {0} > /dev/null; show_event_count {1} \"{2}\"".format(getConfigFilePath(), cur_file_type, filepath), stdout=subprocess.PIPE, shell=True)
    event_count = popen.stdout.read().decode("utf-8")
    logging.debug(event_count)
    if not event_count.isdigit():
        logging.error("Event count not defined for file (format: {0}): {1}".format(cur_file_type, filepath))
    i_event_count = int(event_count)
    if i_event_count < 1:
        logging.error("Event count is zero or less: {0}".format(filepath))
    else: logging.debug('event count: {0}'.format(event_count))
    # md5 of the whole file content.
    # NOTE(review): the matching `try:` opener is elided from this view.
    a_file = open(filepath, "rb")
    file_content = a_file.read()
    file_md5 = hashlib.md5(file_content).hexdigest()
    except Exception as e:
        logging.error("ERROR while processing file '" + filepath + "' (exception: " + str(e));
    # Advance through tokens until one matches "<beam><target>" (e.g. "auau").
    while not beam_target:
        beam_target = re.search("^(?P<beam>({0}))(?P<target>({1}))$".format('|'.join(beam_to_particle.keys()), '|'.join(target_to_particle.keys())), file_tokens[token_num].lower())
        if token_num == len(file_tokens):
            # Ran out of tokens without a beam-target match.
            logging.error("Beam and Target were not found in the file name: {0}".format(filepath))
    logging.debug('{0}-{1}'.format(beam_target.group('beam'),beam_target.group('target')))
    if token_num == len(file_tokens):
        logging.error('unexpected end of the file name after beam-target definition: {0}'.format(filepath))
    # Map name tokens to canonical particle identifiers.
    beam = beam_to_particle[beam_target.group('beam')]
    target = target_to_particle[beam_target.group('target')]
    # Energy: first number in the token, optionally with a fractional part.
    energy_gr = re.search("\d+\.?\d*", file_tokens[token_num])
    # Emitted when no energy token was found (guard elided from this view).
    logging.error("Energy was not found in the file name: {0} with token: {1}".format(filepath, file_tokens[token_num]))
    energy = energy_gr.group()
    # With "." as the separator the fractional part of the energy lands in the
    # next token (e.g. "4.5gev" is split into "4" / "5gev"), so stitch it back.
    if (separator == "."):
        energy_gr = re.search("gev", file_tokens[token_num], re.IGNORECASE)
        energy_gr = re.search(r"\d+", file_tokens[token_num])
        energy = energy + "." + energy_gr.group()
    logging.debug('energy: {0}'.format(energy))
    if token_num == len(file_tokens):
        logging.error('unexpected end of the file name after energy definition: {0}'.format(filepath))
    centrality = file_tokens[token_num]
    # Emitted when no centrality token was found (guard elided from this view).
    logging.error("Centrality was not found in the file name: {0}".format(filepath))
    logging.debug('centrality: {0}'.format(centrality))
    # Log the row about to be written, then insert it with a parameterized
    # query (placeholders, not string interpolation, go to the DB).
    logging.info("\nINSERT INTO simulation_file(generator_name, file_type, file_path, beam_particle, target_particle, energy, centrality, event_count, file_size, file_hash) \
\nVALUES ('{0}', {1}, '{2}', '{3}', '{4}', {5}, '{6}', {7}, {8}, '{9}')".format(generator_type_file, file_type, filepath, beam, target, energy, centrality, event_count, file_size, file_md5))
    cursor = conn.cursor()
    cursor.execute("INSERT INTO simulation_file(generator_name, file_type, file_path, beam_particle, target_particle, energy, centrality, event_count, file_size, file_hash) \
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (generator_type_file, file_type, filepath, beam, target, energy, centrality, event_count, file_size, file_md5))
def recurse_path(dir_path, generator_type_dir, file_type, conn, existing_files, exist_validity):
    """Recursively walk dir_path and catalogue simulation files into the DB.

    dir_path           -- directory to scan
    generator_type_dir -- generator deduced so far from directory names
    file_type          -- 0 for generator files, 1 for DST files
    conn               -- open psycopg2 connection
    existing_files     -- file paths already present in the DB
    exist_validity     -- parallel flags, set to 1 when a DB path is re-seen

    NOTE(review): several lines of this function are elided from this view
    (the skip/continue after an excluded extension, guards around the
    progress output, etc.); comments flag the gaps.
    """
    dir_list = os.listdir(dir_path)
    logging.debug(dir_list)
    # Number of plain files, used as the progress-bar total (idx is 0-based,
    # hence the -1).
    total_files = len([name for name in dir_list if os.path.isfile(os.path.join(dir_path, name))]) - 1
    base_dir_path = os.path.basename(dir_path)
    for idx, file_name in enumerate(dir_list):
        filepath = os.path.join(dir_path, file_name)
        if os.path.isfile(filepath):
            # Skip files with excluded extensions (`exclude_extensions` is
            # defined elsewhere in the file; the actual skip is elided here).
            for excl_extension in exclude_extensions:
                if filepath.endswith(excl_extension):
                    logging.debug('File was skipped because of the extension : {0}'.format(filepath))
            sys.stdout.write('\n')
            printProgress(idx, total_files, base_dir_path, "files", 0)
            # Already catalogued: mark the DB row as still valid on disk.
            if filepath in existing_files:
                exist_validity[existing_files.index(filepath)] = 1
            logging.debug('{0}'.format(filepath))
            logging.debug('generator_type_dir: {0}'.format(generator_type_dir))
            # Parse the base name only (extension stripped).
            file_name = os.path.splitext(file_name)[0]
            # First try "_"-separated names, then fall back to "."-separated.
            result_code = parse_name_and_write(filepath, file_type, file_name, "_", generator_type_dir, conn)
            if result_code < 0:
                continue
            result_code = parse_name_and_write(filepath, file_type, file_name, ".", generator_type_dir, conn)
            if result_code != 0:
                continue
        elif os.path.isdir(filepath):
            # Refine the directory-derived generator from the subdirectory
            # name, preferring the higher-priority (lower index) generator.
            for gen_name in name_to_generator:
                if gen_name in file_name.lower():
                    temp_generator_type = name_to_generator[gen_name]
                    if not generator_type_dir or \
                       (generator_list.index(generator_type_dir) > generator_list.index(temp_generator_type)):
                        generator_type_dir = temp_generator_type
            recurse_path(filepath, generator_type_dir, file_type, conn, existing_files, exist_validity)
# --- script entry: normalize crawl roots to absolute paths ---
simulation_directory = os.path.abspath(simulation_directory)
dst_directory = os.path.abspath(dst_directory)

# Log to sim_crawler.log, overwritten on each run.
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='sim_crawler.log', filemode='w', level=log_level)

# DB credentials are read from a JSON config next to the script.
config = json.load(open("sim_crawler.json"))
# NOTE(review): the matching `try:` opener for the except below is elided
# from this view.
conn = psycopg2.connect(("dbname=%s user=%s host=%s password=%s") % (config["db_name"], config["db_user"], config["db_host"], config["db_pass"]))
except Exception as e:
    logging.error("Connection Error: invalid connection parameters")

# Fetch all file paths already catalogued so the crawl can mark which
# ones still exist on disk.
cursor = conn.cursor()
cursor.execute("SELECT file_path from simulation_file")
existing_files_full = cursor.fetchall()
existing_files = [x[0] for x in existing_files_full]
# Parallel flags: set to 1 by recurse_path when a DB path is re-seen on disk.
exist_validity = [0] * len(existing_files)

# Crawl generator files (file_type 0), then DST files (file_type 1).
recurse_path(simulation_directory, "", 0, conn, existing_files, exist_validity)
recurse_path(dst_directory, "", 1, conn, existing_files, exist_validity)

# Remove DB rows whose files were not found during the crawl.
# NOTE(review): `val` is assigned len(exist_validity) and then immediately
# shadowed by the loop variable -- the initial assignment looks dead; the
# `if val == ...` guard inside the loop appears to be elided from this view.
val = len(exist_validity)
for idx, val in enumerate(exist_validity):
    logging.error("\nDELETE FROM simulation_file WHERE file_path = {0}".format(existing_files[idx]))
    cursor = conn.cursor()
    cursor.execute("DELETE FROM simulation_file WHERE file_path = %s", (existing_files[idx], ))