# Number of unordered feature pairs, C(n, 2) = n*(n-1)/2, used to size the
# one-panel-per-pair figure below.
# Plain integer arithmetic replaces np.math.factorial: the np.math alias was
# deprecated in NumPy 1.25 and removed in NumPy 2.0, and the factorial ratio
# n! / (2 * (n-2)!) reduces exactly to this expression.
combinations = len(feature_names) * (len(feature_names) - 1) // 2
# --- Correlation overview figure -------------------------------------------
# One row of panels: panel 0 is the feature correlation heatmap, the remaining
# `combinations` panels are 2D histograms, one per feature pair.
# NOTE(review): several source lines of this section are not visible in this
# chunk; in particular the subplot selection (`ax = axes[...]`) and any
# `if j <= i` pair filtering presumably live in those gaps — confirm.
fig, axes = plt.subplots(1, 1 + combinations, figsize=(8 * (1 + combinations), 8))

# Pearson correlation matrix of the raw features, rendered as an annotated heatmap.
corr_matrix = df[feature_names].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=axes[0])
axes[0].set_title('Feature Correlation Matrix')

# One 2D histogram per (feature1, feature2) pair.
for i, feature1 in enumerate(feature_names):
    for j, feature2 in enumerate(feature_names):
        # cmin=1 blanks empty bins; norm="log" requests a logarithmic color scale
        # (string norms need matplotlib >= 3.6 — TODO confirm version).
        ax.hist2d(df[feature1], df[feature2], bins=(nbins, nbins), cmap='jet', cmin=1, norm="log")
        ax.set_xlabel(feature1)
        ax.set_ylabel(feature2)
        # Colorbar is taken from the first collection drawn on this axis.
        cbar = plt.colorbar(ax.collections[0], ax=ax)
        cbar.set_label('Events', rotation=90)

plt.savefig('results/correlations.png')
# Number of unordered feature pairs, C(n, 2) = n*(n-1)/2, used to size the
# one-panel-per-pair cluster figure below.
# Plain integer arithmetic replaces np.math.factorial: the np.math alias was
# deprecated in NumPy 1.25 and removed in NumPy 2.0, and the factorial ratio
# n! / (2 * (n-2)!) reduces exactly to this expression.
combinations = len(feature_names) * (len(feature_names) - 1) // 2
# --- Cluster summary figure ------------------------------------------------
# Panel 0: fraction of events per cluster; remaining panels: one scatter plot
# per feature pair, colored by cluster label.
fig, axes = plt.subplots(1, 1 + combinations, figsize=(8 * (1 + combinations), 8))

# Fraction of events assigned to each cluster, one viridis color per cluster.
fractions = [np.sum(clf.labels_ == k)/len(X) for k in range(clf.n_clusters)]
colors = [plt.cm.viridis(i) for i in np.linspace(0, 1, clf.n_clusters)]
counts = axes[0].bar(range(clf.n_clusters), fractions, color=colors)
axes[0].set_xlabel('Cluster')
axes[0].set_ylabel('Fraction of events')

# Pairwise scatter panels. X holds the normalized features, so both the data
# and the cluster centres are rescaled back by max_values before plotting;
# the red 'x' markers are the cluster centres.
# NOTE(review): the subplot selection (`ax = axes[...]`) and any pair
# filtering are in source lines not visible in this chunk — confirm.
for i, feature1 in enumerate(feature_names):
    for j, feature2 in enumerate(feature_names):
        ax.scatter(max_values[feature1] * X[:, i], max_values[feature2] * X[:, j], c=clf.labels_, s=0.1, cmap='viridis')
        ax.scatter(max_values[feature1] * clf.cluster_centers_[:, i], max_values[feature2]*clf.cluster_centers_[:, j], marker='x', color='red', s=100)
        ax.set_xlabel(feature1)
        ax.set_ylabel(feature2)

plt.savefig('results/clusters.png')
# --- Per-cluster feature PDFs with ROOT ------------------------------------
from ROOT import TCanvas, gPad, TH1F, TFile

# One canvas pad per (cluster, feature) combination.
c1 = TCanvas('c1', 'Probability density functions', 800*clf.n_clusters, 800*len(feature_names))
c1.Divide(clf.n_clusters, len(feature_names))

# Book one TH1F per cluster/feature pair, spanning [0, max(feature)].
# NOTE(review): the `histos` dict initialization is in source lines not
# visible in this chunk — presumably `histos = {}` just above this loop.
for feat_name in feature_names:
    for cluster in range(clf.n_clusters):
        hist_name = f"cluster_{cluster}/{feat_name}"
        hist_title = hist_name
        hist = TH1F(hist_name, hist_title, nbins, 0, df[feat_name].max())
        histos[hist_name] = hist

# Fill: route each event's feature values to its assigned cluster's histograms.
for event in range(len(df)):
    cluster = clf.labels_[event]
    for feat_name in feature_names:
        hist_name = f"cluster_{cluster}/{feat_name}"
        hist = histos[hist_name]
        hist.Fill(df[feat_name][event])

# Normalize each histogram to unit integral so it is a probability density.
for feat_name in feature_names:
    for cluster in range(clf.n_clusters):
        hist_name = f"cluster_{cluster}/{feat_name}"
        hist = histos[hist_name]
        # NOTE(review): Integral() == 0 (an empty cluster/feature histogram)
        # would raise ZeroDivisionError here — confirm this cannot happen.
        hist.Scale(1./hist.Integral())

# Save the canvas as an image and as a ROOT macro/file.
# NOTE(review): the drawing loop that populates the pads is in source lines
# not visible in this chunk.
c1.SaveAs("results/feature_pdfs.png")
c1.SaveAs("results/feature_pdfs_cc.root")

# Persist the normalized histograms to a standalone ROOT file.
# NOTE(review): the loop body's Write() call (and the file Close()) are in
# source lines not visible in this chunk.
root_file = TFile("results/feature_pdfs.root", "RECREATE")
for hist_name in histos:
    hist = histos[hist_name]
def main(input_root_file, feature_names, nbins, n_clusters, eps, load_model):
    # Entry point: load features, cluster events with size-constrained
    # k-means (or reload a saved model), then produce the plots above.
    # NOTE(review): several lines of this function are not visible in this
    # chunk (docstring, the fit/predict call, the pickle dump/load bodies,
    # and the tail of the function) — the comments below hedge accordingly.
    df = load_data(input_root_file, feature_names)

    # Feature matrix built from the pre-computed normalized columns.
    X = df[[f'normalized_{feat_name}' for feat_name in feature_names]].values

    # Train a new model unless a saved model path was provided.
    # NOTE(review): load_model is compared against the literal string "None",
    # suggesting the CLI default is the string, not the Python None — confirm.
    if load_model == "None":
        from k_means_constrained import KMeansConstrained

        # Size-constrained k-means: every cluster holds between len(X)/k and
        # (1+eps)*len(X)/k events; n_jobs=-1 uses all cores.
        clf = KMeansConstrained(n_clusters=n_clusters, size_min=int(len(X)/n_clusters), size_max=int((1.0+eps)*len(X)/n_clusters), n_jobs=-1)

        # Persist the fitted model for later reuse.
        # NOTE(review): the with-body (presumably pickle.dump(clf, f)) is in
        # source lines not visible in this chunk.
        with open("results/model.pkl", "wb") as f:

        # Reload a previously fitted model.
        # NOTE(review): the with-body (presumably clf = pickle.load(f)) and
        # the surrounding else-branch are in source lines not visible here.
        with open(load_model, "rb") as f:
# CLI option: allowed fractional fluctuation of cluster sizes around the
# mean len(X)/n_clusters (feeds KMeansConstrained's size_max in main()).
# NOTE(review): the parser construction and the other add_argument calls are
# in source lines not visible in this chunk.
parser.add_argument("-e", "--eps", type=float, default=0.01, help="Value of fluctuation of statistics in clusters")

# Dispatch to main() with the parsed command-line arguments.
main(args.path, feature_names, args.nbins, args.nclusters, args.eps, args.model)