BmnRoot
Loading...
Searching...
No Matches
centrality_clusterization.py
Go to the documentation of this file.
1#!/usr/bin/env python
2# coding: utf-8
3# author: Nikolay Karpushkin
4# example: python3 centrality_clusterization.py -p /home/nikolay/BMN/tmp/7988_shortTreeVtx_mpd_run_Top.root -f fFHCal,fHodo -b 500 -n 10 -e 0.01 -m results/model.pkl
5
import argparse
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import uproot
12
def load_data(input_root_file, feature_names):
    """Read the requested feature branches of the "bmndata" tree into a DataFrame.

    Parameters
    ----------
    input_root_file : str
        Path to the BMNROOT DST ROOT file.
    feature_names : list of str
        Branch names to extract (e.g. "fFHCal", "fHodo").

    Returns
    -------
    pandas.DataFrame
        One column per requested feature, one row per event.
    """
    tree = uproot.open(f"{input_root_file}:bmndata")
    # Other branches of potential interest (kept for reference):
    #   "BmnTrigInfo./BmnTrigInfo.fInputsAR"   -> trigger word
    #   "BmnTrigInfo./BmnTrigInfo.fBC1Integral" -> BC1S integral
    #   "PrimaryVertex./PrimaryVertex.fZ"       -> vertex z
    #   "PrimaryVertex./PrimaryVertex.fNTracks" -> vertex multiplicity
    columns = {name: tree[name].array(library="np") for name in feature_names}
    return pd.DataFrame(columns)
26
def apply_selection(df):
    """Apply event-selection cuts and return the surviving events.

    All cuts are currently disabled, so every event passes; the index is
    simply reset so that downstream positional access (df[col][event]) works.
    """
    # Disabled trigger selection, kept for reference — decode single trigger
    # bits from df["Trigger"] (BT=2, NiT=3, CCT1=4, MBT=5, CCT2=7), e.g.:
    #   bitBT = (df["Trigger"].apply(lambda x: (x >> 2) & 1)).astype(bool)
    # then form the trigger conditions:
    #   trigBT   = bitBT  & ~bitMBT & ~bitCCT1 & ~bitCCT2
    #   trigMBT  = bitMBT & ~bitNiT
    #   trigCCT1 = bitCCT1 & ~bitMBT & ~bitCCT2
    #   trigCCT2 = bitCCT2
    #   df = df[trigMBT]
    # Disabled quality cuts, kept for reference:
    #   df = df[(df["BC1S"] >= 14000) & (df["BC1S"] <= 40000)          # BC1S integral
    #           & (abs(df["VertexZ"]) <= 1.5) & (df["VertexNTracks"] > 1)]  # vertex
    return df.reset_index(drop=True)
48
49
50
def compute_correlation_matrix(df, feature_names, nbins):
    """Plot the feature correlation matrix plus a 2D histogram per feature pair.

    The combined figure is written to results/correlations.png.

    Parameters
    ----------
    df : pandas.DataFrame
        Selected events, one column per feature.
    feature_names : list of str
        Columns of df to correlate.
    nbins : int
        Number of bins per axis of each pairwise 2D histogram.
    """
    # Number of unordered feature pairs, C(n, 2).  math.comb replaces the
    # former np.math.factorial construction: np.math was removed in
    # NumPy 1.25, and the factorial form raised for fewer than 2 features.
    n_pairs = math.comb(len(feature_names), 2)

    # One panel for the heatmap + one per feature pair.  squeeze=False keeps
    # axes indexable even when there are no pairs (single feature).
    fig, axes = plt.subplots(1, 1 + n_pairs, figsize=(8 * (1 + n_pairs), 8), squeeze=False)
    axes = axes[0]

    # Correlation heatmap of the raw features.
    corr_matrix = df[feature_names].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=axes[0])
    axes[0].set_title('Feature Correlation Matrix')

    counter = 1
    for i, feature1 in enumerate(feature_names):
        for feature2 in feature_names[i + 1:]:
            ax = axes[counter]
            # cmin=1 blanks empty bins; log color scale for dynamic range.
            ax.hist2d(df[feature1], df[feature2], bins=(nbins, nbins), cmap='jet', cmin=1, norm="log")
            ax.set_xlabel(feature1)
            ax.set_ylabel(feature2)
            cbar = plt.colorbar(ax.collections[0], ax=ax)
            cbar.set_label('Events', rotation=90)
            counter += 1

    plt.savefig('results/correlations.png')
77
def normalize_features(df, feature_names):
    """Scale each feature into [0, 1] by its observed maximum.

    Adds a 'normalized_<name>' column per feature.  The FHCal maximum is
    inflated by an empirical fudge factor of 1.7 ("podgon") so its
    normalized values stay comfortably below 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Selected events; modified in place.
    feature_names : list of str
        Columns of df to normalize.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The input frame and the per-column maxima used for scaling.
    """
    max_values = df.max()

    # Empirical headroom for FHCal only; applied conditionally — the
    # original indexed 'fFHCal' unconditionally and raised KeyError when
    # that feature was not requested.
    podgon = 1.7
    if 'fFHCal' in max_values.index:
        max_values['fFHCal'] *= podgon

    for feat_name in feature_names:
        df[f'normalized_{feat_name}'] = df[feat_name] / max_values[feat_name]
    return df, max_values
88
def set_plt_options():
    """Configure global matplotlib defaults shared by all figures."""
    plt.rcParams.update({
        'savefig.transparent': True,
        'font.size': 15,
        'figure.subplot.wspace': 0.3,
    })
94
def plot_clusters(feature_names, clf, X, max_values):
    """Plot cluster populations plus a clustered scatter per feature pair.

    The combined figure is written to results/clusters.png.

    Parameters
    ----------
    feature_names : list of str
        Feature names, in the same column order as X.
    clf : fitted clusterer
        Must expose labels_, cluster_centers_ and n_clusters.
    X : numpy.ndarray, shape (n_events, n_features)
        Normalized feature matrix the clusterer was fitted on.
    max_values : mapping
        Per-feature maxima used to undo the normalization for display.
    """
    # Number of unordered feature pairs, C(n, 2).  math.comb replaces the
    # former np.math.factorial construction (np.math was removed in NumPy 1.25).
    n_pairs = math.comb(len(feature_names), 2)

    # squeeze=False keeps axes indexable even with a single feature.
    fig, axes = plt.subplots(1, 1 + n_pairs, figsize=(8 * (1 + n_pairs), 8), squeeze=False)
    axes = axes[0]

    # First panel: fraction of events falling into each cluster.
    fractions = [np.sum(clf.labels_ == k) / len(X) for k in range(clf.n_clusters)]
    colors = [plt.cm.viridis(i) for i in np.linspace(0, 1, clf.n_clusters)]
    axes[0].bar(range(clf.n_clusters), fractions, color=colors)
    axes[0].set_xlabel('Cluster')
    axes[0].set_ylabel('Fraction of events')

    counter = 1
    for i, feature1 in enumerate(feature_names):
        for j in range(i + 1, len(feature_names)):
            feature2 = feature_names[j]
            ax = axes[counter]
            # De-normalize back to physical units for display; red crosses
            # mark the cluster centers.
            ax.scatter(max_values[feature1] * X[:, i], max_values[feature2] * X[:, j],
                       c=clf.labels_, s=0.1, cmap='viridis')
            ax.scatter(max_values[feature1] * clf.cluster_centers_[:, i],
                       max_values[feature2] * clf.cluster_centers_[:, j],
                       marker='x', color='red', s=100)
            ax.set_xlabel(feature1)
            ax.set_ylabel(feature2)
            counter += 1

    plt.savefig('results/clusters.png')
122
123
def create_pdfs(feature_names, clf, df, nbins):
    """Build per-cluster, per-feature probability density functions in ROOT.

    Writes results/feature_pdfs.png and results/feature_pdfs_cc.root (the
    canvas) plus results/feature_pdfs.root (the raw histograms).

    Parameters
    ----------
    feature_names : list of str
        Columns of df to histogram.
    clf : fitted clusterer
        clf.labels_[event] gives the cluster of each row of df.
    df : pandas.DataFrame
        Selected events; index must be 0..len-1 (see apply_selection).
    nbins : int
        Number of histogram bins per feature.
    """
    from ROOT import TCanvas, gPad, TH1F, TFile

    # One pad per (feature, cluster) pair.
    c1 = TCanvas('c1', 'Probability density functions',
                 800 * clf.n_clusters, 800 * len(feature_names))
    c1.Divide(clf.n_clusters, len(feature_names))

    # Book one histogram per (cluster, feature) pair, spanning [0, max].
    histos = {}
    for feat_name in feature_names:
        for cluster in range(clf.n_clusters):
            hist_name = f"cluster_{cluster}/{feat_name}"
            histos[hist_name] = TH1F(hist_name, hist_name, nbins, 0, df[feat_name].max())

    # Fill: route each event's feature values to its cluster's histograms.
    for event in range(len(df)):
        cluster = clf.labels_[event]
        for feat_name in feature_names:
            histos[f"cluster_{cluster}/{feat_name}"].Fill(df[feat_name][event])

    # Normalize to unit area and draw, one pad per histogram.
    counter = 0
    for feat_name in feature_names:
        for cluster in range(clf.n_clusters):
            c1.cd(counter + 1)
            gPad.SetLogy()
            hist = histos[f"cluster_{cluster}/{feat_name}"]
            integral = hist.Integral()
            # Guard: an empty (cluster, feature) histogram previously caused
            # a ZeroDivisionError in 1./hist.Integral().
            if integral > 0:
                hist.Scale(1. / integral)
            hist.Draw('N hist')
            counter += 1

    # Save the canvas in both image and ROOT formats.
    c1.Draw()
    c1.SaveAs("results/feature_pdfs.png")
    c1.SaveAs("results/feature_pdfs_cc.root")

    # Persist the raw histograms as well.
    root_file = TFile("results/feature_pdfs.root", "RECREATE")
    for hist in histos.values():
        hist.Write()
    root_file.Close()
172
173
def main(input_root_file, feature_names, nbins, n_clusters, eps, load_model):
    """Run the full equal-size clusterization pipeline.

    Parameters
    ----------
    input_root_file : str
        Path to the BMNROOT DST file.
    feature_names : list of str
        Branch names used as clustering features.
    nbins : int
        Histogram binning for all plots.
    n_clusters : int
        Number of (equal-size) clusters.
    eps : float
        Allowed relative fluctuation of the cluster sizes.
    load_model : str
        Path to a pre-trained pickled model, or the string "None" to train.
    """
    # All outputs (plots, model, histograms) go under results/; create it
    # up front so the savefig/pickle/ROOT writes cannot fail on a missing
    # directory.
    os.makedirs("results", exist_ok=True)

    # Set the default behavior for matplotlib
    set_plt_options()

    df = load_data(input_root_file, feature_names)
    df = apply_selection(df)
    compute_correlation_matrix(df, feature_names, nbins)

    # Normalize features and assemble the training matrix.
    df, max_values = normalize_features(df, feature_names)
    X = df[[f'normalized_{feat_name}' for feat_name in feature_names]].values

    if load_model == "None":
        # Train a size-constrained k-means: every cluster holds between
        # N/k and (1+eps)*N/k events.
        from k_means_constrained import KMeansConstrained
        import pickle

        clf = KMeansConstrained(n_clusters=n_clusters,
                                size_min=int(len(X) / n_clusters),
                                size_max=int((1.0 + eps) * len(X) / n_clusters),
                                n_jobs=-1)
        clf.fit_predict(X)

        with open("results/model.pkl", "wb") as f:
            pickle.dump(clf, f)
    else:
        # Reuse a previously trained model.
        # NOTE(review): unpickling executes arbitrary code — only load
        # models from trusted paths.
        import pickle
        with open(load_model, "rb") as f:
            clf = pickle.load(f)

    # Create a grid of cluster plots.
    plot_clusters(feature_names, clf, X, max_values)

    # Create probability density functions.
    create_pdfs(feature_names, clf, df, nbins)
205
if __name__ == "__main__":
    # Command-line interface; see the example invocation in the file header.
    parser = argparse.ArgumentParser(
        description="Python script to clusterize data by their features into clusters of equal size")
    parser.add_argument("-p", "--path", required=True,
                        help="Path to the BMNROOT DST file")
    parser.add_argument("-f", "--features", required=True,
                        help="Feature names separated by commas (e.g. FHCalEvent/fTotalEnergy, HodoEvent/fTotalSignal)")
    parser.add_argument("-b", "--nbins", type=int, required=True,
                        help="Number of bins for histograms")
    parser.add_argument("-n", "--nclusters", type=int, required=True,
                        help="Number of clusters")
    parser.add_argument("-e", "--eps", type=float, default=0.01,
                        help="Value of fluctuation of statistics in clusters")
    parser.add_argument("-m", "--model", default="None",
                        help="Path to the pre-trained model")
    args = parser.parse_args()

    main(args.path, args.features.split(","), args.nbins, args.nclusters, args.eps, args.model)
friend F32vec4 max(const F32vec4 &a, const F32vec4 &b)
Definition P4_F32vec4.h:31
int main()