"""Clusterize events from a BMNROOT DST file by their features into clusters of equal size."""

import argparse
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import uproot
def load_data(input_root_file, feature_names):
    """Read the requested feature branches of the bmndata tree into a DataFrame."""
    events = uproot.open(input_root_file + ":bmndata")
    selected_events = {}
    for feat_name in feature_names:
        selected_events[feat_name] = events[feat_name].array(library="np")
    return pd.DataFrame(selected_events)
def apply_selection(df):
    """Apply the event selection cuts and re-index the surviving events."""
    # ... event selection cuts on the feature columns ...
    return df.reset_index(drop=True)
def compute_correlation_matrix(df, feature_names, nbins):
    """Plot the feature correlation matrix and a 2D histogram for every feature pair."""
    # Number of unordered feature pairs: C(n, 2)
    combinations = math.comb(len(feature_names), 2)

    fig, axes = plt.subplots(1, 1 + combinations, figsize=(8 * (1 + combinations), 8))

    # First panel: Pearson correlation matrix of the selected features
    corr_matrix = df[feature_names].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=axes[0])
    axes[0].set_title('Feature Correlation Matrix')

    # Remaining panels: one 2D histogram per feature pair
    panel = 1
    for i, feature1 in enumerate(feature_names):
        for j, feature2 in enumerate(feature_names):
            if j <= i:
                continue
            ax = axes[panel]
            panel += 1
            ax.hist2d(df[feature1], df[feature2], bins=(nbins, nbins), cmap='jet', cmin=1, norm="log")
            ax.set_xlabel(feature1)
            ax.set_ylabel(feature2)
            cbar = plt.colorbar(ax.collections[0], ax=ax)
            cbar.set_label('Events', rotation=90)

    plt.savefig('results/correlations.png')
def normalize_features(df, feature_names):
    """Scale every feature to [0, 1] by dividing it by its maximum value."""
    max_values = df[feature_names].max()
    # Ad-hoc scale factor ("podgon") applied to the FHCal maximum
    max_values['fFHCal'] *= podgon
    for feat_name in feature_names:
        df[f'normalized_{feat_name}'] = df[feat_name] / max_values[feat_name]
    return df, max_values


# Global plotting style
plt.rcParams['savefig.transparent'] = True
plt.rcParams['font.size'] = 15
plt.rcParams['figure.subplot.wspace'] = 0.3
def plot_clusters(feature_names, clf, X, max_values):
    """Plot the cluster populations and the clustered events in every feature plane."""
    # Number of unordered feature pairs: C(n, 2)
    combinations = math.comb(len(feature_names), 2)

    fig, axes = plt.subplots(1, 1 + combinations, figsize=(8 * (1 + combinations), 8))

    # First panel: fraction of events assigned to each cluster
    fractions = [np.sum(clf.labels_ == k) / len(X) for k in range(clf.n_clusters)]
    colors = [plt.cm.viridis(i) for i in np.linspace(0, 1, clf.n_clusters)]
    axes[0].bar(range(clf.n_clusters), fractions, color=colors)
    axes[0].set_xlabel('Cluster')
    axes[0].set_ylabel('Fraction of events')

    # Remaining panels: events colored by cluster label, cluster centers marked in red
    panel = 1
    for i, feature1 in enumerate(feature_names):
        for j, feature2 in enumerate(feature_names):
            if j <= i:
                continue
            ax = axes[panel]
            panel += 1
            ax.scatter(max_values[feature1] * X[:, i], max_values[feature2] * X[:, j],
                       c=clf.labels_, s=0.1, cmap='viridis')
            ax.scatter(max_values[feature1] * clf.cluster_centers_[:, i],
                       max_values[feature2] * clf.cluster_centers_[:, j],
                       marker='x', color='red', s=100)
            ax.set_xlabel(feature1)
            ax.set_ylabel(feature2)

    plt.savefig('results/clusters.png')
def create_pdfs(feature_names, clf, df, nbins):
    """Build per-cluster probability density functions for every feature and save them with ROOT."""
    from ROOT import TCanvas, gPad, TH1F, TFile

    c1 = TCanvas('c1', 'Probability density functions', 800 * clf.n_clusters, 800 * len(feature_names))
    c1.Divide(clf.n_clusters, len(feature_names))

    # Book one histogram per (cluster, feature) pair
    histos = {}
    for feat_name in feature_names:
        for cluster in range(clf.n_clusters):
            hist_name = f"cluster_{cluster}/{feat_name}"
            hist_title = hist_name
            hist = TH1F(hist_name, hist_title, nbins, 0, df[feat_name].max())
            histos[hist_name] = hist

    # Fill each histogram with the events assigned to its cluster
    for event in range(len(df)):
        cluster = clf.labels_[event]
        for feat_name in feature_names:
            hist_name = f"cluster_{cluster}/{feat_name}"
            hist = histos[hist_name]
            hist.Fill(df[feat_name][event])

    # Normalize every histogram to unit integral and draw it in its own pad
    pad = 1
    for feat_name in feature_names:
        for cluster in range(clf.n_clusters):
            hist_name = f"cluster_{cluster}/{feat_name}"
            hist = histos[hist_name]
            hist.Scale(1. / hist.Integral())
            c1.cd(pad)
            hist.Draw()
            pad += 1

    c1.SaveAs("results/feature_pdfs.png")
    c1.SaveAs("results/feature_pdfs_cc.root")

    # Store the normalized histograms in a ROOT file
    root_file = TFile("results/feature_pdfs.root", "RECREATE")
    for hist_name in histos:
        histos[hist_name].Write()
    root_file.Close()
def main(input_root_file, feature_names, nbins, n_clusters, eps, load_model):
    """Run the full pipeline: load, select, inspect, normalize, cluster and build the PDFs."""
    df = load_data(input_root_file, feature_names)
    df = apply_selection(df)
    compute_correlation_matrix(df, feature_names, nbins)

    df, max_values = normalize_features(df, feature_names)
    X = df[[f'normalized_{feat_name}' for feat_name in feature_names]].values

    if load_model == "None":
        # Train a constrained k-means that keeps the clusters (almost) equally populated
        from k_means_constrained import KMeansConstrained
        clf = KMeansConstrained(n_clusters=n_clusters,
                                size_min=int(len(X) / n_clusters),
                                size_max=int((1.0 + eps) * len(X) / n_clusters),
                                n_jobs=-1)
        clf.fit(X)
        with open("results/model.pkl", "wb") as f:
            pickle.dump(clf, f)
    else:
        # Reuse a previously trained model
        with open(load_model, "rb") as f:
            clf = pickle.load(f)

    plot_clusters(feature_names, clf, X, max_values)
    create_pdfs(feature_names, clf, df, nbins)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Clusterize data by their features into clusters of equal size")
    parser.add_argument("-p", "--path", required=True, help="Path to the BMNROOT DST file")
    parser.add_argument("-f", "--features", required=True,
                        help="Feature names separated by commas (e.g. FHCalEvent/fTotalEnergy,HodoEvent/fTotalSignal)")
    parser.add_argument("-b", "--nbins", type=int, required=True, help="Number of bins for the histograms")
    parser.add_argument("-n", "--nclusters", type=int, required=True, help="Number of clusters")
    parser.add_argument("-e", "--eps", type=float, default=0.01,
                        help="Allowed relative fluctuation of the number of events per cluster")
    parser.add_argument("-m", "--model", default="None", help="Path to a pre-trained model")

    args = parser.parse_args()
    feature_names = args.features.split(",")

    main(args.path, feature_names, args.nbins, args.nclusters, args.eps, args.model)
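
# Example invocation (a sketch only: the script name, input file and feature branches
# below are hypothetical and must match your own setup; a results/ directory has to
# exist beforehand, since all plots, models and histograms are written there):
#   python clusterize.py -p run_dst.root \
#       -f FHCalEvent/fTotalEnergy,HodoEvent/fTotalSignal -b 100 -n 4 -e 0.01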