aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJuan Marín Noguera <juan@mnpi.eu>2025-04-18 18:57:14 +0200
committerJuan Marín Noguera <juan@mnpi.eu>2025-04-18 18:57:14 +0200
commita26ecf585ec33763381173d51e9d35485ad2ad18 (patch)
tree96847d75586e408b63c856b14e2f59c54aa7b6eb
parent1eae6ac7dc189236e763b111075c8fba20fb27c3 (diff)
3.4.2 Random Sampling and Shuffling
-rw-r--r--3.4.2.lyx536
-rw-r--r--index.lyx23
2 files changed, 559 insertions, 0 deletions
diff --git a/3.4.2.lyx b/3.4.2.lyx
new file mode 100644
index 0000000..8b8ae25
--- /dev/null
+++ b/3.4.2.lyx
@@ -0,0 +1,536 @@
+#LyX 2.4 created this file. For more info see https://www.lyx.org/
+\lyxformat 620
+\begin_document
+\begin_header
+\save_transient_properties true
+\origin unavailable
+\textclass book
+\begin_preamble
+\input defs
+\end_preamble
+\use_default_options true
+\maintain_unincluded_children no
+\language english
+\language_package default
+\inputencoding utf8
+\fontencoding auto
+\font_roman "default" "default"
+\font_sans "default" "default"
+\font_typewriter "default" "default"
+\font_math "auto" "auto"
+\font_default_family default
+\use_non_tex_fonts false
+\font_sc false
+\font_roman_osf false
+\font_sans_osf false
+\font_typewriter_osf false
+\font_sf_scale 100 100
+\font_tt_scale 100 100
+\use_microtype false
+\use_dash_ligatures true
+\graphics default
+\default_output_format default
+\output_sync 0
+\bibtex_command default
+\index_command default
+\float_placement class
+\float_alignment class
+\paperfontsize default
+\spacing single
+\use_hyperref false
+\papersize default
+\use_geometry false
+\use_package amsmath 1
+\use_package amssymb 1
+\use_package cancel 1
+\use_package esint 1
+\use_package mathdots 1
+\use_package mathtools 1
+\use_package mhchem 1
+\use_package stackrel 1
+\use_package stmaryrd 1
+\use_package undertilde 1
+\cite_engine basic
+\cite_engine_type default
+\biblio_style plain
+\use_bibtopic false
+\use_indices false
+\paperorientation portrait
+\suppress_date false
+\justification true
+\use_refstyle 1
+\use_formatted_ref 0
+\use_minted 0
+\use_lineno 0
+\index Index
+\shortcut idx
+\color #008000
+\end_index
+\secnumdepth 3
+\tocdepth 3
+\paragraph_separation indent
+\paragraph_indentation default
+\is_math_indent 0
+\math_numbering_side default
+\quotes_style english
+\dynamic_quotes 0
+\papercolumns 1
+\papersides 1
+\paperpagestyle default
+\tablestyle default
+\tracking_changes false
+\output_changes false
+\change_bars false
+\postpone_fragile_content false
+\html_math_output 0
+\html_css_as_file 0
+\html_be_strict false
+\docbook_table_output 0
+\docbook_mathml_prefix 1
+\end_header
+
+\begin_body
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+rexerc3[22]
+\end_layout
+
+\end_inset
+
+The
+\begin_inset Formula $(t+1)$
+\end_inset
+
+st item in Algorithm S is selected with probability
+\begin_inset Formula $(n-m)/(N-t)$
+\end_inset
+
+,
+ not
+\begin_inset Formula $n/N$
+\end_inset
+
+,
+ yet the text claims that the sample is unbiased;
+ thus each item should be selected with the
+\emph on
+same
+\emph default
+ probability.
+ How can both of these statements be true?
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+answer
+\end_layout
+
+\end_inset
+
+This is because
+\begin_inset Formula $\frac{n-m}{N-t}$
+\end_inset
+
+ appears as a
+\emph on
+conditioned
+\emph default
+ probability,
+ where the condition depends on
+\begin_inset Formula $t$
+\end_inset
+
+.
+ The actual probability of choosing the
+\begin_inset Formula $(t+1)$
+\end_inset
+
+st element is
+\begin_inset Formula
+\[
+\sum_{m=0}^{t}P(\{m\text{ elements have been chosen between }1\text{ and }t\})\frac{n-m}{N-t}.
+\]
+
+\end_inset
+
+Let
+\begin_inset Formula $P_{mt}$
+\end_inset
+
+ be the probability described with words inside this formula.
+ For
+\begin_inset Formula $m>t$
+\end_inset
+
+ or
+\begin_inset Formula $m<0$
+\end_inset
+
+,
+ this probability is obviously 0,
+ and for
+\begin_inset Formula $t=0$
+\end_inset
+
+,
+
+\begin_inset Formula $P_{00}=1$
+\end_inset
+
+.
+ For
+\begin_inset Formula $t\geq1$
+\end_inset
+
+ and
+\begin_inset Formula $0\leq m\leq t$
+\end_inset
+
+,
+
+\begin_inset Formula
+\[
+P_{mt}=P_{m(t-1)}\left(1-\frac{n-m}{N-t+1}\right)+P_{(m-1)(t-1)}\frac{n-m+1}{N-t+1}.
+\]
+
+\end_inset
+
+Note that,
+ by this formula,
+
+\begin_inset Formula $P_{(n-1)t}=0$
+\end_inset
+
+ always,
+ since the second term is 0 and the first term is a multiple of
+\begin_inset Formula $P_{(n-1)(t-1)}$
+\end_inset
+
+,
+ which is 0 by induction since
+\begin_inset Formula $P_{(n-1)(-1)}=0$
+\end_inset
+
+.
+ Then,
+ by induction in
+\begin_inset Formula $m$
+\end_inset
+
+,
+
+\begin_inset Formula $P_{mt}=0$
+\end_inset
+
+ for any
+\begin_inset Formula $m>n$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+Since the algorithm is said to be unbiased,
+ we would expect
+\begin_inset Formula
+\[
+P_{mt}=\frac{\binom{t}{m}\binom{N-t}{n-m}}{\binom{N}{n}},
+\]
+
+\end_inset
+
+that is,
+ the number of ways to choose
+\begin_inset Formula $m$
+\end_inset
+
+ elements among the first
+\begin_inset Formula $t$
+\end_inset
+
+ ones and
+\begin_inset Formula $n-m$
+\end_inset
+
+ elements among the rest,
+ divided by the total number of ways of choosing
+\begin_inset Formula $n$
+\end_inset
+
+ elements among
+\begin_inset Formula $N$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+We prove this by induction.
+ For
+\begin_inset Formula $t=0$
+\end_inset
+
+ we have seen it already.
+ For
+\begin_inset Formula $t>0$
+\end_inset
+
+,
+ if
+\begin_inset Formula $m=0$
+\end_inset
+
+,
+\begin_inset Formula
+\[
+P_{0t}=P_{0(t-1)}\left(1-\frac{n}{N-t+1}\right)=\frac{\binom{N-t+1}{n}}{\binom{N}{n}}\frac{N-t-n+1}{N-t+1}=\frac{\binom{N-t}{n}}{\binom{N}{n}},
+\]
+
+\end_inset
+
+and if
+\begin_inset Formula $m>0$
+\end_inset
+
+,
+\begin_inset Formula
+\begin{align*}
+P_{mt} & =\frac{\binom{t-1}{m}\binom{N-t+1}{n-m}}{\binom{N}{n}}\frac{N-t-n+m+1}{N-t+1}+\frac{\binom{t-1}{m-1}\binom{N-t+1}{n-m+1}}{\binom{N}{n}}\frac{n-m+1}{N-t+1}=\\
+ & =\frac{\binom{t-1}{m}\binom{N-t}{n-m}}{\binom{N}{n}}+\frac{\binom{t-1}{m-1}\binom{N-t}{n-m}}{\binom{N}{n}}=\frac{\binom{t}{m}\binom{N-t}{n-m}}{\binom{N}{n}},
+\end{align*}
+
+\end_inset
+
+where we make extensive use of Eq.
+ 1.2.6–(8).
+ Finally,
+ by Eq.
+ 1.2.6–(21),
+ the probability of choosing any given element is
+\begin_inset Formula
+\[
+\sum_{m=0}^{t}P_{mt}\frac{n-m}{N-t}=\frac{1}{\binom{N}{n}}\sum_{m=0}^{t}\binom{t}{m}\binom{N-1-t}{n-1-m}=\frac{\binom{N-1}{n-1}}{\binom{N}{n}}=\frac{\binom{N-1}{N-n}}{\binom{N}{N-n}}=\frac{n}{N}.
+\]
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+rexerc11[M25]
+\end_layout
+
+\end_inset
+
+Let
+\begin_inset Formula $p_{m}$
+\end_inset
+
+ be the probability that exactly
+\begin_inset Formula $m$
+\end_inset
+
+ elements are put into the reservoir during the first pass of Algorithm R.
+ Determine the generating function
+\begin_inset Formula $G(z)=\sum_{m}p_{m}z^{m}$
+\end_inset
+
+,
+ and find the mean and standard deviation.
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+answer
+\end_layout
+
+\end_inset
+
+Let
+\begin_inset Formula $s_{k}\coloneqq P(\text{the }k\text{th element is copied to the reservoir})$
+\end_inset
+
+,
+ then
+\begin_inset Formula $p_{m}$
+\end_inset
+
+ is the sum across all subsets
+\begin_inset Formula $\{x_{1},\dots,x_{m}\}\in\{1,\dots,N\}$
+\end_inset
+
+ of
+\begin_inset Formula $m$
+\end_inset
+
+ elements,
+
+\begin_inset Formula $x_{1}<\dots<x_{m}$
+\end_inset
+
+,
+ of the product of the
+\begin_inset Formula $s_{x_{i}}$
+\end_inset
+
+ times the
+\begin_inset Formula $(1-s_{j})$
+\end_inset
+
+ for
+\begin_inset Formula $j\in\{1,\dots N\}\setminus\{x_{1},\dots,x_{m}\}$
+\end_inset
+
+.
+ Rearranging terms we can see that
+\begin_inset Formula
+\[
+G(z)=\prod_{k=1}^{N}(s_{k}z+1-s_{k})=z^{n}\prod_{k=n+1}^{N}\left(\frac{nz}{k}+1-\frac{n}{k}\right)=z^{n}\prod_{k=n+1}^{N}\frac{n(z-1)+k}{k}.
+\]
+
+\end_inset
+
+Let
+\begin_inset Formula $F(z)\coloneqq\prod_{k=n+1}^{N}\frac{n(z-1)+k}{k}$
+\end_inset
+
+,
+ we have
+\begin_inset Formula $F(1)=1$
+\end_inset
+
+ and
+\begin_inset Formula
+\begin{align*}
+\dot{F}(z) & =F(z)\sum_{k=n+1}^{N}\frac{n}{n(z-1)+k}, & \dot{F}(1) & =\sum_{k=n+1}^{N}\frac{n}{k}=n(H_{N}-H_{n});\\
+\ddot{F}(z) & =\frac{\dot{F}(z)^{2}}{F(z)}-F(z)\sum_{k=n+1}^{N}\frac{n^{2}}{(n(z-1)+k)^{2}}, & \ddot{F}(1) & =n^{2}\left((H_{N}-H_{n})^{2}-(H_{N}^{(2)}-H_{n}^{(2)})\right).
+\end{align*}
+
+\end_inset
+
+Using this,
+\begin_inset Formula
+\begin{align*}
+\dot{G}(z) & =nz^{n-1}F(z)+z^{n}\dot{F}(z),\\
+\ddot{G}(z) & =n(n-1)z^{n-2}F(z)+2nz^{n-1}\dot{F}(z)+z^{n}\ddot{F}(z),
+\end{align*}
+
+\end_inset
+
+so
+\begin_inset Formula
+\begin{align*}
+\text{mean}(G) & =\dot{G}(1)=n(1+H_{N}-H_{n});\\
+\text{var}(G) & =\ddot{G}(1)+\dot{G}(1)-\dot{G}(1)^{2}=\cancel{n(n-1)}\cancel{+2n\dot{F}(1)}+\ddot{F}(1)\cancel{+n}+\dot{F}(1)\cancel{-n^{2}}-\dot{F}(1)^{2}\cancel{-2n\dot{F}(1)}\\
+ & =n(H_{N}-H_{n})-n^{2}(H_{N}^{(2)}-H_{n}^{(2)}),\\
+\sigma(G) & =\sqrt{\text{var}(G)}=\sqrt{n(H_{N}-H_{n})-n^{2}(H_{N}^{(2)}-H_{n}^{(2)})}.
+\end{align*}
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+rexerc16[M25]
+\end_layout
+
+\end_inset
+
+Devise a way to compute a random sample of
+\begin_inset Formula $n$
+\end_inset
+
+ records from
+\begin_inset Formula $N$
+\end_inset
+
+,
+ given
+\begin_inset Formula $N$
+\end_inset
+
+ and
+\begin_inset Formula $n$
+\end_inset
+
+,
+ based on the idea of hashing (Section 6.4).
+ Your method should use
+\begin_inset Formula $O(n)$
+\end_inset
+
+ storage locations and an average of
+\begin_inset Formula $O(n)$
+\end_inset
+
+ units of time,
+ and it should present the sample as a sorted set of integers
+\begin_inset Formula $1\leq X_{1}<X_{2}<\dots<X_{n}\leq N$
+\end_inset
+
+.
+\end_layout
+
+\begin_layout Standard
+\begin_inset ERT
+status open
+
+\begin_layout Plain Layout
+
+
+\backslash
+answer
+\end_layout
+
+\end_inset
+
+
+\begin_inset Note Note
+status open
+
+\begin_layout Plain Layout
+TODO Depends on Section 6.4.
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
+\end_body
+\end_document
diff --git a/index.lyx b/index.lyx
index 030ec8d..3090b9f 100644
--- a/index.lyx
+++ b/index.lyx
@@ -2135,6 +2135,29 @@ A10+R25
Random Sampling and Shuffling
\end_layout
+\begin_layout Standard
+\begin_inset CommandInset include
+LatexCommand input
+filename "3.4.2.lyx"
+literal "false"
+
+\end_inset
+
+
+\begin_inset Note Note
+status open
+
+\begin_layout Plain Layout
+
+\family typewriter
+A10+R25-16
+\end_layout
+
+\end_inset
+
+
+\end_layout
+
\begin_layout Section
What Is a Random Sequence?
\end_layout