# sim_crawler.py (BmnRoot): crawls the simulation and DST directories on EOS,
# parses metadata from file names and synchronizes it with the simulation_file
# table of the Unified Condition Database.
1# -*- coding: utf-8 -*-
2import os
3import sys
4import json
5import psycopg2
6import re
7import subprocess
8import logging
9import hashlib
10
# root directories crawled for generator output and for reconstructed (DST) files
simulation_directory = "/eos/nica/bmn/sim/gen"
dst_directory = "/eos/nica/bmn/sim/dst"

# Maps a lower-case name fragment found in a path to the canonical generator name.
# index defines the priority of the generator for the case of different generators in a path
# (lower insertion index = higher priority)
# NOTE: UNIGEN uses different generator names in a path, so there is no possibility to define the original one
name_to_generator = {
    "unigen": "UNIGEN",
    "dcmsmm": "DCMSMM",
    "dcm-smm": "DCMSMM",
    "urqmd": "UrQMD",
    "dqgsm": "DCMQGSM",
    "dcmqgsm": "DCMQGSM",
    "dcm-qgsm": "DCMQGSM",
    "rqmd": "RQMD",
    "jam": "RQMD"
}
# canonical generator names in priority order (derived from name_to_generator)
generator_list = list(name_to_generator.values())

# lower-case beam token (as found in a file name) -> beam particle as stored in the DB
beam_to_particle = {
    "bi": "Bi",
    "pb": "Pb",
    "au": "Au",
    "xe": "Xe",
    "ag": "Ag",
    "kr": "Kr",
    "ar": "Ar",
    "c" : "C",
    "d" : "d",
    "p" : "p"
}

# lower-case target token (as found in a file name) -> target particle as stored in the DB
target_to_particle = {
    "csi": "CsI",
    "pb": "Pb",
    "au": "Au",
    "cs": "Cs",
    "xe": "Xe",
    "sn": "Sn",
    "ag": "Ag",
    "cu": "Cu",
    "al": "Al",
    "c" : "C",
    "p" : "p"
}

# file extensions skipped by the crawler
exclude_extensions = {
    ".out"
}

# logging verbosity written to sim_crawler.log
log_level=logging.INFO #logging.DEBUG
61
def getConfigFilePath():
    """Return the absolute path of build/config.sh, resolved relative to this script's directory."""
    script_dir = os.path.dirname(os.path.realpath(__file__))
    return os.path.abspath(os.path.join(script_dir, "..", "..", "build", "config.sh"))
64
# Print iterations progress
def printProgress(iteration, total, prefix = 'Progress:', suffix = 'Complete', percent_view = 1):
    """
    Render a one-line terminal progress bar; call once per loop iteration.
    @params:
        iteration    - Required : current iteration, 0-based (Int)
        total        - Required : total iterations minus one (Int); both are shifted by +1 internally
        prefix       - Optional : text printed before the bar (Str)
        suffix       - Optional : text printed after the bar (Str)
        percent_view - Optional : 1 shows a percentage, anything else shows "i of n" (Int)
    """
    decimals = 1  # positive number of decimals in percent complete
    length = 25   # character length of bar
    fill = '█'    # bar fill character
    # shift both counters so that the bar reaches 100% on the last iteration
    done = iteration + 1
    whole = total + 1
    n_filled = int(length * done // whole)
    bar = fill * n_filled + '-' * (length - n_filled)
    if percent_view == 1:
        percent = ("{0:." + str(decimals) + "f}").format(100 * (done / float(whole)))
        sys.stdout.write('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix))
    else:
        sys.stdout.write('\r%s |%s| %s %s' % (prefix, bar, ("{0} of {1}").format(done, whole), suffix))
    sys.stdout.flush()
87
# return 0 - success; <0 - error not concerned with the separator; >0 - error may be concerned with the choice of the separator as argument
def parse_name_and_write(filepath, file_type, file_name, separator, generator_type_dir, conn):
    """Parse simulation metadata from a file name and insert a row into the 'simulation_file' table.

    @params:
        filepath           - Required : full path of the file (String)
        file_type          - Required : 0 for generator output, otherwise DST/ROOT file (Int)
        file_name          - Required : file name without extension (String)
        separator          - Required : token separator to try, e.g. '_' or '.' (String)
        generator_type_dir - Required : generator deduced from the directory path, '' if unknown (String)
        conn               - Required : open psycopg2 connection

    Returns 0 on success, a negative code for errors independent of the separator,
    and a positive code for errors that may be fixed by retrying with another separator.
    """
    # get file size
    file_size = os.path.getsize(filepath)
    logging.debug('file size: {0} MB'.format(file_size/1024.0/1024.0))
    if file_size <= 0:
        logging.error("File size is wrong: {0}".format(filepath))
        return -1

    # split file name by separation symbol
    file_tokens = file_name.split(separator)

    token_num = 0
    # parse generator name if it exists in the first token
    generator_type_file = ""
    for gen_name in name_to_generator:
        # check whether the generator name is contained in the current token
        if gen_name in file_tokens[token_num].lower():
            generator_type_file = name_to_generator[gen_name]
            if generator_type_dir and generator_type_dir != generator_type_file:
                # generator types deduced from the directory and from the file name differ:
                # keep the one with the lower index in generator_list (higher priority).
                # BUGFIX: the file-name generator was looked up in the undefined name
                # 'keys_list', which raised NameError whenever the types disagreed.
                if generator_list.index(generator_type_dir) < generator_list.index(generator_type_file):
                    generator_type_file = generator_type_dir
                else:
                    generator_type_dir = generator_type_file
            else:
                logging.debug('generator type in file: {0}'.format(generator_type_file))
            token_num += 1
            if token_num == len(file_tokens):
                logging.error('unexpected end of the file name after generator definition: {0}'.format(filepath))
                return 1
            break

    # fall back to the generator deduced from the directory path
    if not generator_type_file: generator_type_file = generator_type_dir
    if not generator_type_file:
        logging.error('generator type was not set for file: {0}'.format(filepath))

    # get event count via 'show_event_count' executable file (from the Condition Database interface)
    cur_file_type = generator_type_file if file_type == 0 else 'root'
    popen = subprocess.Popen(". {0} > /dev/null; show_event_count {1} \"{2}\"".format(getConfigFilePath(), cur_file_type, filepath), stdout=subprocess.PIPE, shell=True)
    popen.wait()
    event_count = popen.stdout.read().decode("utf-8")
    logging.debug(event_count)
    if not event_count.isdigit():
        logging.error("Event count not defined for file (format: {0}): {1}".format(cur_file_type, filepath))
        return -2
    if int(event_count) < 1:
        logging.error("Event count is zero or less: {0}".format(filepath))
        return -3
    logging.debug('event count: {0}'.format(event_count))

    # get file MD5 checksum
    # BUGFIX: the file handle was never closed - use a 'with' block
    try:
        with open(filepath, "rb") as a_file:
            file_md5 = hashlib.md5(a_file.read()).hexdigest()
    except Exception as e:
        # BUGFIX: the log message was missing its closing parenthesis
        logging.error("ERROR while processing file '" + filepath + "' (exception: " + str(e) + ")")
        return -4

    # parse beam and target: scan tokens until one matches '<beam><target>' exactly, e.g. 'auau'
    beam_target_pattern = "^(?P<beam>({0}))(?P<target>({1}))$".format('|'.join(beam_to_particle.keys()), '|'.join(target_to_particle.keys()))
    beam_target = ""
    while not beam_target:
        beam_target = re.search(beam_target_pattern, file_tokens[token_num].lower())
        token_num += 1
        if token_num == len(file_tokens):
            break
    if not beam_target:
        logging.error("Beam and Target were not found in the file name: {0}".format(filepath))
        return -5
    else:
        logging.debug('{0}-{1}'.format(beam_target.group('beam'), beam_target.group('target')))
    if token_num == len(file_tokens):
        logging.error('unexpected end of the file name after beam-target definition: {0}'.format(filepath))
        return 2
    beam = beam_to_particle[beam_target.group('beam')]
    target = target_to_particle[beam_target.group('target')]

    # parse collision energy (BUGFIX: raw string for the regex pattern)
    energy_gr = re.search(r"\d+\.?\d*", file_tokens[token_num])
    if not energy_gr:
        logging.error("Energy was not found in the file name: {0} with token: {1}".format(filepath, file_tokens[token_num]))
        return 3
    energy = energy_gr.group()
    token_num += 1
    if separator == ".":  # if separator is dot then it can divide a float value of the energy
        # BUGFIX: guard against running past the last token before probing for the fraction
        if token_num < len(file_tokens) and re.search("gev", file_tokens[token_num], re.IGNORECASE):
            frac_gr = re.search(r"\d+", file_tokens[token_num])
            if frac_gr:
                energy = energy + "." + frac_gr.group()
                token_num += 1
    logging.debug('energy: {0}'.format(energy))

    if token_num == len(file_tokens):
        logging.error('unexpected end of the file name after energy definition: {0}'.format(filepath))
        return 4

    # parse centrality: the next token is taken verbatim
    centrality = file_tokens[token_num]
    if not centrality:
        logging.error("Centrality was not found in the file name: {0}".format(filepath))
        return 5
    logging.debug('centrality: {0}'.format(centrality))
    token_num += 1

    logging.info("\nINSERT INTO simulation_file(generator_name, file_type, file_path, beam_particle, target_particle, energy, centrality, event_count, file_size, file_hash) \
\nVALUES ('{0}', {1}, '{2}', '{3}', '{4}', {5}, '{6}', {7}, {8}, '{9}')".format(generator_type_file, file_type, filepath, beam, target, energy, centrality, event_count, file_size, file_md5))
    # insert the new file into the Database (parameterized query, safe against SQL injection)
    cursor = conn.cursor()
    cursor.execute("INSERT INTO simulation_file(generator_name, file_type, file_path, beam_particle, target_particle, energy, centrality, event_count, file_size, file_hash) \
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", (generator_type_file, file_type, filepath, beam, target, energy, centrality, event_count, file_size, file_md5))
    conn.commit()
    cursor.close()
    return 0
213
# Recursive processing the simulation directory
def recurse_path(dir_path, generator_type_dir, file_type, conn, existing_files, exist_validity):
    """Recursively walk dir_path: parse and register every regular file, descend into subdirectories.

    @params:
        dir_path           - Required : directory to process (String)
        generator_type_dir - Required : generator deduced so far from the path, '' if unknown (String)
        file_type          - Required : forwarded to parse_name_and_write (0 = generator output) (Int)
        conn               - Required : open psycopg2 connection
        existing_files     - Required : file paths already present in the Database (list of String)
        exist_validity     - Required : per-path flags, set to 1 when a known file is seen on disk (list of Int)
    """
    dir_list = os.listdir(dir_path)
    logging.debug(dir_list)

    first_file = 1
    # '-1' compensates for the '+1' applied inside printProgress to both counters
    total_files = len([name for name in dir_list if os.path.isfile(os.path.join(dir_path, name))]) - 1
    base_dir_path = os.path.basename(dir_path)
    for idx, file_name in enumerate(dir_list):
        filepath = os.path.join(dir_path, file_name)

        # whether it is a normal file
        if os.path.isfile(filepath):
            # Check if the file has a correct extension
            isSkip = False
            for excl_extension in exclude_extensions:
                if filepath.endswith(excl_extension):
                    logging.debug('File was skipped because of the extension : {0}'.format(filepath))
                    isSkip = True
                    break
            if isSkip: continue

            if first_file == 1:
                first_file = 0
                sys.stdout.write('\n') # Print New Line
            printProgress(idx, total_files, base_dir_path, "files", 0)

            # find file in the list obtained from the Database: if exists, then skip
            if filepath in existing_files:
                exist_validity[existing_files.index(filepath)] = 1
                continue

            logging.debug('{0}'.format(filepath))
            logging.debug('generator_type_dir: {0}'.format(generator_type_dir))
            # remove extension
            file_name = os.path.splitext(file_name)[0]
            # first try '_' as separator; positive codes mean a retry with '.' may succeed
            result_code = parse_name_and_write(filepath, file_type, file_name, "_", generator_type_dir, conn) # both filepath and file_name to exclude duplication of operations
            if result_code < 0: continue
            # try to use '.' separator for parsing the file name
            if result_code > 0:
                result_code = parse_name_and_write(filepath, file_type, file_name, ".", generator_type_dir, conn)
                if result_code != 0: continue

        # whether it is a directory
        elif os.path.isdir(filepath):
            # refine the generator type from the subdirectory name before descending
            # NOTE(review): generator_type_dir is reassigned here and therefore also
            # affects LATER sibling entries of this loop, not only the recursion below —
            # confirm this carry-over between siblings is intended
            for gen_name in name_to_generator:
                if gen_name in file_name.lower():
                    temp_generator_type = name_to_generator[gen_name]
                    if not generator_type_dir or \
                        (generator_list.index(generator_type_dir) > generator_list.index(temp_generator_type)):
                        generator_type_dir = temp_generator_type
                    break
            recurse_path(filepath, generator_type_dir, file_type, conn, existing_files, exist_validity)

    return 0
269
270
# MAIN SCRIPT
simulation_directory = os.path.abspath(simulation_directory)
dst_directory = os.path.abspath(dst_directory)

# create log file
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S', filename='sim_crawler.log', filemode='w', level=log_level)

# load JSON configuration with the DB credentials
# BUGFIX: the config file handle was never closed - use a 'with' block
with open("sim_crawler.json") as config_file:
    config = json.load(config_file)

# connect to the Unified Condition Database
try:
    conn = psycopg2.connect(("dbname=%s user=%s host=%s password=%s") % (config["db_name"], config["db_user"], config["db_host"], config["db_pass"]))
except Exception as e:
    logging.error("Connection Error: invalid connection parameters")
    logging.error(e)
    # BUGFIX: without exiting, the script crashed below with NameError on 'conn'
    sys.exit(1)

# select all simulation files currently known to the Database
cursor = conn.cursor()
cursor.execute("SELECT file_path from simulation_file")
conn.commit()
existing_files = [row[0] for row in cursor.fetchall()]
cursor.close()

# validity flag per known file: set to 1 by recurse_path when the file is found on disk
exist_validity = [0] * len(existing_files)

# process all files in simulation directory (file_type 0: generator output)
recurse_path(simulation_directory, "", 0, conn, existing_files, exist_validity)
print()

# process all files in simulation DST directory (file_type 1: reconstructed files)
recurse_path(dst_directory, "", 1, conn, existing_files, exist_validity)
print()

# delete files from the Database which have not been found on disk
for idx, is_valid in enumerate(exist_validity):
    if is_valid != 1:
        logging.error("\nDELETE FROM simulation_file WHERE file_path = {0}".format(existing_files[idx]))
        cursor = conn.cursor()
        cursor.execute("DELETE FROM simulation_file WHERE file_path = %s", (existing_files[idx], ))
        conn.commit()
        cursor.close()

# close connection to the Database
conn.close()