|
1 | 1 | """ |
2 | 2 | .. redirect-from:: /gallery/statistics/histogram_features |
3 | 3 |
|
4 | | -=================================== |
5 | | -Histogram bins, density, and weight |
6 | | -=================================== |
| 4 | +.. _histogram_normalization: |
| 5 | +
|
| 6 | +================================== |
| 7 | +bins, density, and weights in hist |
| 8 | +================================== |
7 | 9 |
|
8 | 10 | The `.Axes.hist` method can flexibly create histograms in a few different ways, |
9 | 11 | which is flexible and helpful, but can also lead to confusion. In particular, |
10 | | -you can: |
| 12 | +the method has the following parameters: |
11 | 13 |
|
12 | | -- bin the data as you want, either with an automatically chosen number of |
| 14 | +- ``bins``: bin the data as you want, either with an automatically chosen number of |
13 | 15 |   bins, or with fixed bin edges, |
14 | | -- normalize the histogram so that its integral is one, |
15 | | -- and assign weights to the datapoints, so that each data point affects the |
16 | | -  count in its bin differently. |
| 16 | +- ``density``: normalize the histogram so that its integral is one, |
| 17 | +- ``weights``: assign weights to each datapoint so that each point affects the |
| 18 | +  count differently. |
17 | 19 |
|
18 | 20 | The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results, |
19 | 21 | therefore users should consult the numpy documentation for a definitive guide. |
20 | 22 |
|
21 | 23 | Histograms are created by defining bin edges, and taking a dataset of values |
22 | 24 | and sorting them into the bins, and counting or summing how much data is in |
23 | | -each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3 |
24 | | -bins: |
| 25 | +each bin. In this example, 9 numbers between 1 and 4 are sorted into 3 bins: |
25 | 26 | """ |
26 | 27 |
|
27 | 28 | importmatplotlib.pyplotasplt |
|
36 | 37 | # very clear where the boundaries of the bins are: |
37 | 38 | style= {'facecolor':'none','edgecolor':'C0','linewidth':3} |
38 | 39 |
|
39 | | -fig,ax=plt.subplots() |
| 40 | +fig,ax=plt.subplots(layout='constrained',figsize=(8,4)) |
| 41 | + |
| 42 | +# count the number of values in xdata between each value in xbins |
40 | 43 | ax.hist(xdata,bins=xbins,**style) |
41 | 44 |
|
42 | | -# plot the xdatalocations on the x axis: |
43 | | -ax.plot(xdata,0*xdata,'d') |
44 | | -ax.set_ylabel('Number per bin') |
45 | | -ax.set_xlabel('x bins (dx=1.0)') |
| 45 | +# plot the xdataevents |
| 46 | +ax.eventplot(xdata,color='C1',alpha=.5) |
| 47 | + |
| 48 | +ax.set(xlabel='Number per bin',ylabel='x bins (dx=1.0)',title='histogram') |
46 | 49 |
|
47 | 50 | # %% |
48 | | -#Modifyingbins |
49 | | -# ============== |
| 51 | +# bins |
| 52 | +# ==== |
50 | 53 | # |
51 | 54 | # Changing the bin size changes the shape of this sparse histogram, so it's a |
52 | | -# good idea to choose bins with some care with respect to your data. Here we |
53 | | -# make the bins half as wide. |
| 55 | +# good idea to choose bins with some care with respect to your data. The `.Axes.hist` |
| 56 | +# *bins* parameter accepts either the number of bins or a list of bin edges. |
| 57 | +# |
| 58 | +# |
| 59 | +# Fixed bin edges |
| 60 | +# --------------- |
| 61 | +# |
| 62 | +# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4]. |
| 63 | +# These bins are half as wide as those in the previous example. |
54 | 64 |
|
55 | 65 | xbins=np.arange(1,4.5,0.5) |
56 | 66 |
|
57 | | -fig,ax=plt.subplots() |
| 67 | +fig,ax=plt.subplots(layout='constrained',figsize=(8,3)) |
| 68 | + |
58 | 69 | ax.hist(xdata,bins=xbins,**style) |
59 | | -ax.plot(xdata,0*xdata,'d') |
60 | | -ax.set_ylabel('Number per bin') |
61 | | -ax.set_xlabel('x bins (dx=0.5)') |
| 70 | + |
| 71 | +ax.eventplot(xdata,lineoffsets=.5,color='C1',alpha=.5) |
| 72 | + |
| 73 | +ax.set(ylabel='cpunt',xlabel='x bins (dx=0.5)', |
| 74 | +title='fixed bin edges: bins=np.arange(1, 4.5, .5)',) |
62 | 75 |
|
63 | 76 | # %% |
| 77 | +# |
| 78 | +# Number of bins |
| 79 | +# -------------- |
| 80 | +# |
64 | 81 | # We can also let numpy (via Matplotlib) choose the bins automatically, or |
65 | 82 | # specify a number of bins to choose automatically: |
66 | 83 |
|
67 | | -fig,ax=plt.subplot_mosaic([['auto','n4']], |
68 | | -sharex=True,sharey=True,layout='constrained') |
| 84 | +fig,ax=plt.subplot_mosaic([['auto'], ['n4']], |
| 85 | +sharex=True,sharey=True, |
| 86 | +layout='constrained',figsize=(8,6)) |
69 | 87 |
|
70 | 88 | ax['auto'].hist(xdata,**style) |
71 | | -ax['auto'].plot(xdata,0*xdata,'d') |
72 | | -ax['auto'].set_ylabel('Number per bin') |
73 | | -ax['auto'].set_xlabel('x bins (auto)') |
| 89 | +ax['auto'].eventplot(xdata,lineoffsets=.5,color='C1',alpha=.5) |
| 90 | + |
| 91 | +ax['auto'].set(ylabel='count',xlabel='x bins', |
| 92 | +title='dynamically computed bin edges: bins="auto"') |
74 | 93 |
|
75 | 94 | ax['n4'].hist(xdata,bins=4,**style) |
76 | | -ax['n4'].plot(xdata,0*xdata,'d') |
77 | | -ax['n4'].set_xlabel('x bins ("bins=4")') |
| 95 | +ax['n4'].eventplot(xdata,lineoffsets=.5,color='C1',alpha=.5) |
| 96 | + |
| 97 | +ax['n4'].set(ylabel='count',xlabel='x bins', |
| 98 | +title='fixed number of bins: bins=4',) |
78 | 99 |
|
79 | 100 | # %% |
80 | | -#Normalizing histograms:density and weight |
81 | | -# ========================================== |
| 101 | +# density |
| 102 | +# ======= |
82 | 103 | # |
83 | 104 | # Counts-per-bin is the default length of each bar in the histogram. However, |
84 | 105 | # we can also normalize the bar lengths as a probability density function using |
85 | 106 | # the ``density`` parameter: |
86 | 107 |
|
87 | | -fig,ax=plt.subplots() |
| 108 | +fig,ax=plt.subplots(layout='constrained',figsize=(8,3)) |
| 109 | + |
88 | 110 | ax.hist(xdata,bins=xbins,density=True,**style) |
89 | | -ax.set_ylabel('Probability density [$V^{-1}$])') |
90 | | -ax.set_xlabel('x bins (dx=0.5 $V$)') |
| 111 | + |
| 112 | +ax.set(ylabel='Probability density [$V^{-1}$])', |
| 113 | +xlabel='x bins (dx=0.5 $V$)', |
| 114 | +title='normalizing histogram using density') |
91 | 115 |
|
92 | 116 | # %% |
93 | 117 | # This normalization can be a little hard to interpret when just exploring the |
94 | 118 | # data. The value attached to each bar is divided by the total number of data |
95 | | -# points *and* the width of the bin, and thus the values_integrate_ to one |
| 119 | +# points *and* the width of the bin, and thus the values *integrate* to one |
96 | 120 | # when integrating across the full range of data. |
97 | 121 | # e.g. :: |
98 | 122 | # |
|
121 | 145 | # distribution function by both the length of the data and the width of the |
122 | 146 | # bins: |
123 | 147 |
|
124 | | -fig,ax=plt.subplot_mosaic([['False','True']],layout='constrained') |
125 | 148 | dx=0.1 |
126 | 149 | xbins=np.arange(-4,4,dx) |
127 | | -ax['False'].hist(xdata,bins=xbins,density=False,histtype='step',label='Counts') |
128 | 150 |
|
| 151 | +fig,ax=plt.subplot_mosaic([['False','True']],layout='constrained', |
| 152 | +figsize=(8,4)) |
| 153 | + |
| 154 | + |
| 155 | +ax['False'].hist(xdata,bins=xbins,density=False,histtype='step',label='Counts') |
129 | 156 | # scale and plot the expected pdf: |
130 | | -ax['False'].plot(xpdf,pdf*len(xdata)*dx,label=r'$N\,f_X(x)\,\delta x$') |
131 | | -ax['False'].set_ylabel('Count per bin') |
132 | | -ax['False'].set_xlabel('x bins [V]') |
133 | | -ax['False'].legend() |
| 157 | +ax['False'].plot(xpdf,pdf*len(xdata)*dx,label=r'$N\,f_X(x)\,\delta x$',alpha=.5) |
| 158 | + |
134 | 159 |
|
135 | 160 | ax['True'].hist(xdata,bins=xbins,density=True,histtype='step',label='density') |
136 | | -ax['True'].plot(xpdf,pdf,label='$f_X(x)$') |
137 | | -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
138 | | -ax['True'].set_xlabel('x bins [$V$]') |
| 161 | +ax['True'].plot(xpdf,pdf,label='$f_X(x)$',alpha=.5) |
| 162 | + |
| 163 | + |
| 164 | +ax['False'].set(ylabel='Count per bin',xlabel='x bins [V]', |
| 165 | +title="normalization using scaling, density=False") |
| 166 | +ax['False'].legend() |
| 167 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]',xlabel='x bins [$V$]', |
| 168 | +title="density=True") |
139 | 169 | ax['True'].legend() |
140 | 170 |
|
141 | 171 | # %% |
142 | | -# One advantage of using the density is therefore that the shape and amplitude |
143 | | -# of the histogram does not depend on the size of the bins. Consider an |
144 | | -# extreme case where the bins do not have the same width. In this example, the |
145 | | -# bins below ``x=-1.25`` are six times wider than the rest of the bins. By |
146 | | -# normalizing by density, we preserve the shape of the distribution, whereas if |
147 | | -# we do not, then the wider bins have much higher counts than the thinner bins: |
148 | | - |
149 | | -fig,ax=plt.subplot_mosaic([['False','True']],layout='constrained') |
| 172 | +# weights |
| 173 | +# ======= |
| 174 | +# |
| 175 | +# Sometimes people want to normalize so that the sum of counts is one. This is |
| 176 | +# analogous to a `probability mass function |
| 177 | +# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete |
| 178 | +# variable where the sum of probabilities for all the values equals one. |
| 179 | +# |
| 180 | +# Using ``hist``, we can get this normalization if we set the *weights* to 1/N. |
| 181 | +# Note that the amplitude of this normalized histogram still depends on |
| 182 | +# width and/or number of bins: |
| 183 | + |
| 184 | +fig,ax=plt.subplots(layout='constrained',figsize=(8,3)) |
| 185 | + |
| 186 | +fornn,dxinenumerate([0.1,0.4,1.2]): |
| 187 | +xbins=np.arange(-4,4,dx) |
| 188 | +ax.hist(xdata,bins=xbins,weights=1/len(xdata)*np.ones(len(xdata)), |
| 189 | +histtype='step',label=f'{dx}') |
| 190 | + |
| 191 | +ax.set(ylabel='Bin count / N',xlabel='x bins [$V$]', |
| 192 | +title="histogram normalization using weights") |
| 193 | +ax.legend(fontsize='small',title='bin width:') |
| 194 | + |
| 195 | +# %% |
| 196 | +# Case studies |
| 197 | +# ============ |
| 198 | +# |
| 199 | +# Histogram normalization is used to compare histograms of different populations or |
| 200 | +# generated in different ways. |
| 201 | +# |
| 202 | +# Irregularly spaced bins |
| 203 | +# ----------------------- |
| 204 | +# One advantage of using the density is that the shape and amplitude of the histogram |
| 205 | +# do not depend on the size of the bins. Consider an extreme case where the bins do |
| 206 | +# not have the same width. In this example, the bins below ``x=-1.25`` are six times |
| 207 | +# wider than the rest of the bins. By normalizing by density, we preserve the shape of |
| 208 | +# the distribution, whereas if we do not, then the wider bins have much higher counts |
| 209 | +# than the thinner bins: |
| 210 | + |
150 | 211 | dx=0.1 |
151 | 212 | xbins=np.hstack([np.arange(-4,-1.25,6*dx),np.arange(-1.25,4,dx)]) |
| 213 | + |
| 214 | +fig,ax=plt.subplot_mosaic([['False','True']], |
| 215 | +layout='constrained',figsize=(8,3)) |
| 216 | + |
| 217 | + |
152 | 218 | ax['False'].hist(xdata,bins=xbins,density=False,histtype='step',label='Counts') |
153 | | -ax['False'].plot(xpdf,pdf*len(xdata)*dx,label=r'$N\,f_X(x)\,\delta x_0$') |
154 | | -ax['False'].set_ylabel('Count per bin') |
155 | | -ax['False'].set_xlabel('x bins [V]') |
156 | | -ax['False'].legend() |
| 219 | +ax['False'].plot(xpdf,pdf*len(xdata)*dx,label=r'$N\,f_X(x)\,\delta x_0$', |
| 220 | +alpha=.5) |
157 | 221 |
|
158 | 222 | ax['True'].hist(xdata,bins=xbins,density=True,histtype='step',label='density') |
159 | | -ax['True'].plot(xpdf,pdf,label='$f_X(x)$') |
160 | | -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
161 | | -ax['True'].set_xlabel('x bins [$V$]') |
| 223 | +ax['True'].plot(xpdf,pdf,label='$f_X(x)$',alpha=.5) |
| 224 | + |
| 225 | + |
| 226 | +ax['False'].set(ylabel='Count per bin',xlabel='x bins [V]', |
| 227 | +title="irregularly spaced bins, density=False") |
| 228 | +ax['False'].legend() |
| 229 | + |
| 230 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]',xlabel='x bins [$V$]', |
| 231 | +title="irregularly spaced bins, density=True",) |
162 | 232 | ax['True'].legend() |
163 | 233 |
|
164 | 234 | # %% |
| 235 | +# Different bin widths |
| 236 | +# -------------------- |
| 237 | +# |
165 | 238 | # Similarly, if we want to compare histograms with different bin widths, we may |
166 | 239 | # want to use ``density=True``: |
167 | 240 |
|
168 | | -fig,ax=plt.subplot_mosaic([['False','True']],layout='constrained') |
| 241 | +fig,ax=plt.subplot_mosaic([['False','True']], |
| 242 | +layout='constrained',figsize=(8,3)) |
169 | 243 |
|
170 | 244 | # expected PDF |
171 | 245 | ax['True'].plot(xpdf,pdf,'--',label='$f_X(x)$',color='k') |
172 | 246 |
|
173 | 247 | fornn,dxinenumerate([0.1,0.4,1.2]): |
174 | 248 | xbins=np.arange(-4,4,dx) |
175 | 249 | # expected histogram: |
176 | | -ax['False'].plot(xpdf,pdf*1000*dx,'--',color=f'C{nn}') |
| 250 | +ax['False'].plot(xpdf,pdf*1000*dx,'--',color=f'C{nn}',alpha=.5) |
177 | 251 | ax['False'].hist(xdata,bins=xbins,density=False,histtype='step') |
178 | 252 |
|
179 | 253 | ax['True'].hist(xdata,bins=xbins,density=True,histtype='step',label=dx) |
180 | 254 |
|
181 | | -# Labels: |
182 | | -ax['False'].set_xlabel('x bins [$V$]') |
183 | | -ax['False'].set_ylabel('Count per bin') |
184 | | -ax['True'].set_ylabel('Probability density [$V^{-1}$]') |
185 | | -ax['True'].set_xlabel('x bins [$V$]') |
| 255 | +ax['False'].set(ylabel='Count per bin',xlabel='x bins [$V$]', |
| 256 | +title="density=False") |
| 257 | +ax['True'].set(ylabel='Probability density [$V^{-1}$]',xlabel='x bins [$V$]', |
| 258 | +title='density=True') |
186 | 259 | ax['True'].legend(fontsize='small',title='bin width:') |
187 | | - |
188 | | -# %% |
189 | | -# Sometimes people want to normalize so that the sum of counts is one. This is |
190 | | -# analogous to a `probability mass function |
191 | | -# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete |
192 | | -# variable where the sum of probabilities for all the values equals one. Using |
193 | | -# ``hist``, we can get this normalization if we set the *weights* to 1/N. |
194 | | -# Note that the amplitude of this normalized histogram still depends on |
195 | | -# width and/or number of the bins: |
196 | | - |
197 | | -fig,ax=plt.subplots(layout='constrained',figsize=(3.5,3)) |
198 | | - |
199 | | -fornn,dxinenumerate([0.1,0.4,1.2]): |
200 | | -xbins=np.arange(-4,4,dx) |
201 | | -ax.hist(xdata,bins=xbins,weights=1/len(xdata)*np.ones(len(xdata)), |
202 | | -histtype='step',label=f'{dx}') |
203 | | -ax.set_xlabel('x bins [$V$]') |
204 | | -ax.set_ylabel('Bin count / N') |
205 | | -ax.legend(fontsize='small',title='bin width:') |
206 | | - |
207 | 260 | # %% |
| 261 | +# Populations of different sizes |
| 262 | +# ------------------------------ |
| 263 | +# |
208 | 264 | # The value of normalizing histograms is comparing two distributions that have |
209 | | -# different sized populations.Here we compare the distribution of ``xdata`` |
| 265 | +# different sized populations. Here we compare the distribution of ``xdata`` |
210 | 266 | # with a population of 1000, and ``xdata2`` with 100 members. |
211 | 267 |
|
212 | 268 | xdata2=rng.normal(size=100) |
213 | 269 |
|
214 | 270 | fig,ax=plt.subplot_mosaic([['no_norm','density','weight']], |
215 | | -layout='constrained',figsize=(8,4)) |
| 271 | +layout='constrained',figsize=(9,3)) |
216 | 272 |
|
217 | 273 | xbins=np.arange(-4,4,0.25) |
218 | 274 |
|
219 | | -ax['no_norm'].hist(xdata,bins=xbins,histtype='step') |
220 | | -ax['no_norm'].hist(xdata2,bins=xbins,histtype='step') |
221 | | -ax['no_norm'].set_ylabel('Counts') |
222 | | -ax['no_norm'].set_xlabel('x bins [$V$]') |
223 | | -ax['no_norm'].set_title('No normalization') |
224 | | - |
225 | | -ax['density'].hist(xdata,bins=xbins,histtype='step',density=True) |
226 | | -ax['density'].hist(xdata2,bins=xbins,histtype='step',density=True) |
227 | | -ax['density'].set_ylabel('Probability density [$V^{-1}$]') |
228 | | -ax['density'].set_title('Density=True') |
229 | | -ax['density'].set_xlabel('x bins [$V$]') |
230 | | - |
231 | | -ax['weight'].hist(xdata,bins=xbins,histtype='step', |
232 | | -weights=1/len(xdata)*np.ones(len(xdata)), |
233 | | -label='N=1000') |
234 | | -ax['weight'].hist(xdata2,bins=xbins,histtype='step', |
235 | | -weights=1/len(xdata2)*np.ones(len(xdata2)), |
236 | | -label='N=100') |
237 | | -ax['weight'].set_xlabel('x bins [$V$]') |
238 | | -ax['weight'].set_ylabel('Counts / N') |
| 275 | +forxdin [xdata,xdata2]: |
| 276 | +ax['no_norm'].hist(xd,bins=xbins,histtype='step') |
| 277 | +ax['density'].hist(xd,bins=xbins,histtype='step',density=True) |
| 278 | +N=len(xd) |
| 279 | +ax['weight'].hist(xd,bins=xbins,histtype='step',weights=1/N*np.ones(N), |
| 280 | +label=f'N={N}') |
| 281 | + |
| 282 | + |
| 283 | +ax['no_norm'].set(ylabel='Counts',xlabel='x bins [$V$]', |
| 284 | +title='No normalization') |
| 285 | +ax['density'].set(ylabel='Probability density [$V^{-1}$]',xlabel='x bins [$V$]', |
| 286 | +title='Density=True') |
| 287 | +ax['weight'].set(ylabel='Counts / N',xlabel='x bins [$V$]', |
| 288 | +title='Weight = 1/N') |
239 | 289 | ax['weight'].legend(fontsize='small') |
240 | | -ax['weight'].set_title('Weight = 1/N') |
241 | 290 |
|
242 | 291 | plt.show() |
243 | 292 |
|
|
253 | 302 | # - `matplotlib.axes.Axes.set_xlabel` |
254 | 303 | # - `matplotlib.axes.Axes.set_ylabel` |
255 | 304 | # - `matplotlib.axes.Axes.legend` |
| 305 | +# |