diff options
| author | Juan Marín Noguera <juan@mnpi.eu> | 2025-05-18 18:34:49 +0200 |
|---|---|---|
| committer | Juan Marín Noguera <juan@mnpi.eu> | 2025-05-18 18:34:49 +0200 |
| commit | 32119b4cc2f104267462261e58831969eec5fcfb (patch) | |
| tree | 9e0cfafada2ac2efdf2638e8554617b531d29aa6 | |
| parent | eaaeab40bcfd97cc6f01971843022c1401ca1852 (diff) | |
4.2.2. Accuracy of Floating Point Arithmetic
| -rw-r--r-- | vol2/4.2.2.lyx | 1072 | ||||
| -rw-r--r-- | vol2/index.lyx | 47 |
2 files changed, 1119 insertions, 0 deletions
diff --git a/vol2/4.2.2.lyx b/vol2/4.2.2.lyx new file mode 100644 index 0000000..abcc87a --- /dev/null +++ b/vol2/4.2.2.lyx @@ -0,0 +1,1072 @@ +#LyX 2.4 created this file. For more info see https://www.lyx.org/ +\lyxformat 620 +\begin_document +\begin_header +\save_transient_properties true +\origin unavailable +\textclass book +\begin_preamble +\input defs +\end_preamble +\use_default_options true +\maintain_unincluded_children no +\language english +\language_package default +\inputencoding utf8 +\fontencoding auto +\font_roman "default" "default" +\font_sans "default" "default" +\font_typewriter "default" "default" +\font_math "auto" "auto" +\font_default_family default +\use_non_tex_fonts false +\font_sc false +\font_roman_osf false +\font_sans_osf false +\font_typewriter_osf false +\font_sf_scale 100 100 +\font_tt_scale 100 100 +\use_microtype false +\use_dash_ligatures true +\graphics default +\default_output_format default +\output_sync 0 +\bibtex_command default +\index_command default +\float_placement class +\float_alignment class +\paperfontsize default +\spacing single +\use_hyperref false +\papersize default +\use_geometry false +\use_package amsmath 1 +\use_package amssymb 1 +\use_package cancel 1 +\use_package esint 1 +\use_package mathdots 1 +\use_package mathtools 1 +\use_package mhchem 1 +\use_package stackrel 1 +\use_package stmaryrd 1 +\use_package undertilde 1 +\cite_engine basic +\cite_engine_type default +\biblio_style plain +\use_bibtopic false +\use_indices false +\paperorientation portrait +\suppress_date false +\justification true +\use_refstyle 1 +\use_formatted_ref 0 +\use_minted 0 +\use_lineno 0 +\index Index +\shortcut idx +\color #008000 +\end_index +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\paragraph_indentation default +\is_math_indent 0 +\math_numbering_side default +\quotes_style english +\dynamic_quotes 0 +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tablestyle default +\tracking_changes false 
+\output_changes false +\change_bars false +\postpone_fragile_content false +\html_math_output 0 +\html_css_as_file 0 +\html_be_strict false +\docbook_table_output 0 +\docbook_mathml_prefix 1 +\end_header + +\begin_body + +\begin_layout Standard + +\emph on +Note: + +\emph default + Normalized floating point arithmetic is assumed unless the contrary is specified. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +exerc4[10] +\end_layout + +\end_inset + +Is it possible to have floating point numbers +\begin_inset Formula $u$ +\end_inset + +, + +\begin_inset Formula $v$ +\end_inset + +, + and +\begin_inset Formula $w$ +\end_inset + + for which exponent overflow occurs during the calculation of +\begin_inset Formula $u\otimes(v\otimes w)$ +\end_inset + + but not during the calculation of +\begin_inset Formula $(u\otimes v)\otimes w$ +\end_inset + +? +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + +Yes. + If, + say, + +\begin_inset Formula $b=10$ +\end_inset + +, + +\begin_inset Formula $q=8$ +\end_inset + +, + and overflow occurs when the exponent reaches 16, + let +\begin_inset Formula $u=(15,.10000001)$ +\end_inset + +, + +\begin_inset Formula $v=(9,.33333330)$ +\end_inset + +, + and +\begin_inset Formula $w=(9,.30000000)$ +\end_inset + +. + Then +\begin_inset Formula $v\otimes w=(9,.99999990)$ +\end_inset + + and +\begin_inset Formula $u\otimes(v\otimes w)=(16,.10000000)$ +\end_inset + +, + which raises an overflow, + but +\begin_inset Formula $u\otimes v=(15,.33333333)$ +\end_inset + + and +\begin_inset Formula $(u\otimes v)\otimes w=(15,.99999999)$ +\end_inset + +, + which does not raise an overflow. 
+\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc8[20] +\end_layout + +\end_inset + +Let +\begin_inset Formula $\epsilon=0.0001$ +\end_inset + +; + which of the relations +\begin_inset Formula +\begin{align*} +u & \prec v\quad(\epsilon), & u & \sim v\quad(\epsilon), & u & \succ v\quad(\epsilon), & u & \approx v\quad(\epsilon) +\end{align*} + +\end_inset + +hold for the following pairs of base 10, + excess 0, + eight-digit floating point numbers? +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $u=(1,+.31415927)$ +\end_inset + +, + +\begin_inset Formula $v=(1,+.31416000)$ +\end_inset + +; +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $u=(0,+.99997000)$ +\end_inset + +, + +\begin_inset Formula $v=(1,+.10000039)$ +\end_inset + +; +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $u=(24,+.60221400)$ +\end_inset + +, + +\begin_inset Formula $v=(27,+.00060221)$ +\end_inset + +; +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $u=(24,+.60221400)$ +\end_inset + +, + +\begin_inset Formula $v=(31,+.00000006)$ +\end_inset + +; +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $u=(24,+.60221400)$ +\end_inset + +, + +\begin_inset Formula $v=(28,+.00000000)$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $\sim$ +\end_inset + +, + +\begin_inset Formula $\approx$ +\end_inset + +. +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $\sim$ +\end_inset + +, + +\begin_inset Formula $\approx$ +\end_inset + +. +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $\sim$ +\end_inset + +, + +\begin_inset Formula $\approx$ +\end_inset + +. +\end_layout + +\begin_layout Enumerate +\begin_inset Formula $\sim$ +\end_inset + +. 
+\end_layout + +\begin_layout Enumerate +\begin_inset Formula $\sim$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc10[M25] +\end_layout + +\end_inset + +(W. + M. + Kahan.) A certain computer performs floating point arithmetic without proper rounding, + and, + in fact, + its floating point multiplication routine ignores all but the first +\begin_inset Formula $p$ +\end_inset + + most significant digits of the +\begin_inset Formula $2p$ +\end_inset + +-digit product +\begin_inset Formula $f_{u}f_{v}$ +\end_inset + +. + (Thus when +\begin_inset Formula $f_{u}f_{v}<1/b$ +\end_inset + +, + the least-significant digit of +\begin_inset Formula $u\otimes v$ +\end_inset + + always comes out to be zero, + due to subsequent normalization.) Show that this causes the monotonicity of multiplication to fail; + in other words, + exhibit positive normalized floating point numbers +\begin_inset Formula $u$ +\end_inset + +, + +\begin_inset Formula $v$ +\end_inset + +, + and +\begin_inset Formula $w$ +\end_inset + + such that +\begin_inset Formula $u<v$ +\end_inset + + but +\begin_inset Formula $u\otimes w>v\otimes w$ +\end_inset + + on this machine. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + +Assume +\begin_inset Formula $p=4$ +\end_inset + +, + and let +\begin_inset Formula $u=.9999<1.000=v$ +\end_inset + + and let +\begin_inset Formula $w=.2222$ +\end_inset + +. + Then +\begin_inset Formula $u\otimes w=.2221>.2220=v\otimes w$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc13[M25] +\end_layout + +\end_inset + +Some programming languages (and even some computers) make use of floating point arithmetic only, + with no provision for exact calculations with integers. 
+ If operations on integers are desired, + we can, + of course, + represent an integer as a floating point number; + and when the floating point operations satisfy the basic definitions in (9), + we know that all floating point operations will be exact, + provided that the operands and the answer can each be represented exactly with +\begin_inset Formula $p$ +\end_inset + + significant digits. + Therefore— +so long as we know that the numbers aren't too large— +we can add, + subtract, + or multiply integers with no inaccuracy due to rounding errors. +\end_layout + +\begin_layout Standard +But suppose that a programmer wants to determine if +\begin_inset Formula $m$ +\end_inset + + is an exact multiple of +\begin_inset Formula $n$ +\end_inset + +, + when +\begin_inset Formula $m$ +\end_inset + + and +\begin_inset Formula $n\neq0$ +\end_inset + + are integers. + Suppose further that a subroutine is available to calculate the quantity +\begin_inset Formula $\text{round}(u\bmod1)=u\mathring{\bmod}1$ +\end_inset + + for any given floating point number +\begin_inset Formula $u$ +\end_inset + +, + as in exercise 4.2.1–15. + One good way to determine whether or not +\begin_inset Formula $m$ +\end_inset + + is a multiple of +\begin_inset Formula $n$ +\end_inset + + might be to test whether or not +\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$ +\end_inset + +, + using the assumed subroutine; + but perhaps rounding errors in the floating point calculations will invalidate this test in certain cases. +\end_layout + +\begin_layout Standard +Find suitable conditions on the range of integer values +\begin_inset Formula $n\neq0$ +\end_inset + + and +\begin_inset Formula $m$ +\end_inset + +, + such that +\begin_inset Formula $m$ +\end_inset + + is a multiple of +\begin_inset Formula $n$ +\end_inset + + if and only if +\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$ +\end_inset + +. 
+ In other words, + show that if +\begin_inset Formula $m$ +\end_inset + + and +\begin_inset Formula $n$ +\end_inset + + are not too large, + this test is valid. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + +A suitable condition would be +\begin_inset Formula $|m|<2b^{p-1}$ +\end_inset + +; + for the proof we may assume +\begin_inset Formula $m,n\geq0$ +\end_inset + + as the signs of the operands do not affect the check. + In every case, + if +\begin_inset Formula $n\mid m$ +\end_inset + +, + then +\begin_inset Formula $m\oslash n=\text{round}(\frac{m}{n})$ +\end_inset + + will necessarily be an integer and +\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$ +\end_inset + +. + For the converse, + if +\begin_inset Formula $n\nmid m$ +\end_inset + +, + then +\begin_inset Formula $m\oslash n=\frac{m}{n}+\delta$ +\end_inset + +, + where +\begin_inset Formula $|\delta|\leq\frac{1}{2}b^{e_{m\oslash n}-p-q}$ +\end_inset + +. + Note that the exponent is not increased after rounding in the division; + if it were, + that would mean that +\begin_inset Formula $b^{e}(1-\frac{1}{2}b^{-p})\leq\frac{m}{n}<b^{e}$ +\end_inset + + for some integer +\begin_inset Formula $e$ +\end_inset + +, + but then +\begin_inset Formula $nb^{e}(1-\frac{1}{2}b^{-p})\leq m<nb^{e}$ +\end_inset + + and, + because both +\begin_inset Formula $m$ +\end_inset + + and +\begin_inset Formula $nb^{e}$ +\end_inset + + are integers, + +\begin_inset Formula $\frac{1}{2}nb^{e-p}\geq1$ +\end_inset + +, + so +\begin_inset Formula $nb^{e}\geq2b^{p}$ +\end_inset + + and +\begin_inset Formula $m\geq2b^{p}-1>2b^{p-1}\#$ +\end_inset + +. + This means that +\begin_inset Formula $e_{m\oslash n}=\lfloor\log_{b}\frac{m}{n}\rfloor+1+q$ +\end_inset + +. 
+ Now, + since +\begin_inset Formula $n\nmid m$ +\end_inset + +, + +\begin_inset Formula $\log_{b}\frac{m}{n}\notin\mathbb{Z}$ +\end_inset + +, + so +\begin_inset Formula $\lfloor\log_{b}\frac{m}{n}\rfloor+1=\lceil\log_{b}\frac{m}{n}\rceil=\lceil\log_{b}\frac{m}{2}-\log_{b}\frac{n}{2}\rceil\leq\lceil\log_{b}\frac{m}{2}\rceil-\lfloor\log_{b}\frac{n}{2}\rfloor$ +\end_inset + +. + With this, +\begin_inset Formula +\[ +|\delta|\leq\frac{1}{2}b^{\lceil\log_{b}\frac{m}{2}\rceil-\lfloor\log_{b}\frac{n}{2}\rfloor-p}\leq\frac{1}{2}b^{-1}b^{-\lfloor\log_{b}\frac{n}{2}\rfloor}<\frac{1}{2}b^{\cancel{-1}}b^{-(\log_{b}\frac{n}{2}\cancel{-1})}=\frac{1}{n}, +\] + +\end_inset + +but +\begin_inset Formula $\frac{m}{n}$ +\end_inset + + differs from the nearest integer by +\begin_inset Formula $\frac{1}{n}$ +\end_inset + + at least, + so +\begin_inset Formula $m\oslash n\notin\mathbb{Z}$ +\end_inset + + and +\begin_inset Formula $(m\oslash n)\mathring{\bmod}1\neq0$ +\end_inset + +. + This is assuming that there's no exponent underflow, + which would be rare because it would mean that +\begin_inset Formula $q<p-1$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc15[M24] +\end_layout + +\end_inset + +(H. + Björk.) Does the computed midpoint of an interval always lie between the endpoints? + (In other words, + does +\begin_inset Formula $u\leq v$ +\end_inset + + imply that +\begin_inset Formula $u\leq(u\oplus v)\oslash2\leq v$ +\end_inset + +?) +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + +No. 
+ For example, + if +\begin_inset Formula $b=10$ +\end_inset + +, + +\begin_inset Formula $p=5$ +\end_inset + +, + +\begin_inset Formula $u=5.9998$ +\end_inset + +, + and +\begin_inset Formula $v=5.9999$ +\end_inset + +, + then +\begin_inset Formula $u\oplus v=12.000$ +\end_inset + + and +\begin_inset Formula $(u\oplus v)\oslash2=6.0000>v$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc25[15] +\end_layout + +\end_inset + +When people speak about inaccuracy in floating point arithmetic they often ascribe errors to +\begin_inset Quotes eld +\end_inset + +cancellation +\begin_inset Quotes erd +\end_inset + + that occurs during the subtraction of nearly equal quantities. + But when +\begin_inset Formula $u$ +\end_inset + + and +\begin_inset Formula $v$ +\end_inset + + are approximately equal, + the difference +\begin_inset Formula $u\ominus v$ +\end_inset + + is obtained exactly, + with no error. + What do these people really mean? +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + +It really means that, + if the inputs carry a relative error due to rounding, + the relative error of the output is potentially much bigger. + Let +\begin_inset Formula $u_{0}$ +\end_inset + + be the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + value of +\begin_inset Formula $u$ +\end_inset + +, + that is, + the value it would have if the operations so far had been carried out with infinite precision, + and let +\begin_inset Formula $v_{0}$ +\end_inset + + be the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + value of +\begin_inset Formula $v$ +\end_inset + +, + similarly defined. 
+ Let +\begin_inset Formula $u\eqqcolon u_{0}(1+\delta)$ +\end_inset + + and +\begin_inset Formula $v\eqqcolon v_{0}(1+\delta')$ +\end_inset + +. + Then, + if +\begin_inset Formula $u$ +\end_inset + + and +\begin_inset Formula $v$ +\end_inset + + are nearly equal and +\begin_inset Formula $u\ominus v$ +\end_inset + + is obtained exactly, + then +\begin_inset Formula +\[ +\frac{u\ominus v}{u_{0}-v_{0}}=\frac{u_{0}(1+\delta)-v_{0}(1+\delta')}{u_{0}-v_{0}}=1+\frac{u_{0}\delta-v_{0}\delta'}{u_{0}-v_{0}}. +\] + +\end_inset + +Even if +\begin_inset Formula $\delta$ +\end_inset + + and +\begin_inset Formula $\delta'$ +\end_inset + + are small, + the new relative error +\begin_inset Formula $\left|\frac{u_{0}\delta-v_{0}\delta'}{u_{0}-v_{0}}\right|$ +\end_inset + + can be quite big, + as there's no reason for +\begin_inset Formula $\delta$ +\end_inset + + and +\begin_inset Formula $\delta'$ +\end_inset + + to be similar. + In the worst case where +\begin_inset Formula $\delta=-\delta'$ +\end_inset + +, + the relative error of the inputs is multiplied by +\begin_inset Formula $\left|\frac{u_{0}+v_{0}}{u_{0}-v_{0}}\right|$ +\end_inset + +. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +rexerc29[M25] +\end_layout + +\end_inset + +Give an example to show that the condition +\begin_inset Formula $b^{p}\geq3$ +\end_inset + + is necessary in the previous exercise. +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Plain Layout + + +\backslash +answer +\end_layout + +\end_inset + + +\begin_inset Note Greyedout +status open + +\begin_layout Plain Layout +(I had to look up the solution.) +\end_layout + +\end_inset + +Here +\begin_inset Formula $\text{round}(x)=2^{e}$ +\end_inset + + for some integer +\begin_inset Formula $e$ +\end_inset + + such that +\begin_inset Formula $|x-2^{e}|$ +\end_inset + + is lowest. 
+ If +\begin_inset Formula $f(x)\coloneqq x^{99/100}$ +\end_inset + +, + then +\begin_inset Formula $g(y)\coloneqq y^{100/99}$ +\end_inset + +. + Now, + for integer +\begin_inset Formula $e$ +\end_inset + +, +\begin_inset Formula +\begin{multline*} +\text{round}(f(2^{e}))=\text{round}(2^{e\cdot99/100})<2^{e}\iff(2^{99/100})^{e}<\frac{3}{4}2^{e}\iff\\ +\iff(2^{-1/100})^{e}<\frac{3}{4}\iff e>41. +\end{multline*} + +\end_inset + +Conversely, +\begin_inset Formula +\begin{multline*} +\text{round}(g(2^{e}))=\text{round}(2^{e\cdot100/99})\leq2^{e}\iff(2^{100/99})^{e}<\frac{3}{2}2^{e}\iff\\ +\iff(2^{1/99})^{e}<\frac{3}{2}\iff e<58. +\end{multline*} + +\end_inset + +Thus, + if +\begin_inset Formula $e\in\{42,\dots,58\}$ +\end_inset + +, + +\begin_inset Formula $\hat{h}(2^{e})<2^{e}$ +\end_inset + +, + and it's easy to see that in fact +\begin_inset Formula $\hat{h}(2^{e})=2^{e-1}$ +\end_inset + +. + Thus +\begin_inset Formula $\hat{h}^{2}(2^{53})=2^{51}\neq2^{50}=\hat{h}^{3}(2^{53})$ +\end_inset + +. +\end_layout + +\end_body +\end_document diff --git a/vol2/index.lyx b/vol2/index.lyx index fba036c..22d5987 100644 --- a/vol2/index.lyx +++ b/vol2/index.lyx @@ -987,6 +987,18 @@ literal "false" \end_inset +\begin_inset Note Note +status open + +\begin_layout Plain Layout + +\family typewriter +A10+R25 +\end_layout + +\end_inset + + \end_layout \begin_layout Section @@ -1006,12 +1018,47 @@ literal "false" \end_inset +\begin_inset Note Note +status open + +\begin_layout Plain Layout + +\family typewriter +A10+R25 +\end_layout + +\end_inset + + \end_layout \begin_layout Subsection Accuracy of Floating Point Arithmetic \end_layout +\begin_layout Standard +\begin_inset CommandInset include +LatexCommand input +filename "4.2.2.lyx" +literal "false" + +\end_inset + + +\begin_inset Note Note +status open + +\begin_layout Plain Layout + +\family typewriter +A10+R25 +\end_layout + +\end_inset + + +\end_layout + \begin_layout Subsection Double-Precision Calculations \end_layout |
