#LyX 2.4 created this file. For more info see https://www.lyx.org/
\lyxformat 620
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass book
\begin_preamble
\input defs
\end_preamble
\use_default_options true
\maintain_unincluded_children no
\language english
\language_package default
\inputencoding utf8
\fontencoding auto
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_roman_osf false
\font_sans_osf false
\font_typewriter_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures true
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\float_placement class
\float_alignment class
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\use_formatted_ref 0
\use_minted 0
\use_lineno 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\tablestyle default
\tracking_changes false
\output_changes false
\change_bars false
\postpone_fragile_content false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\docbook_table_output 0
\docbook_mathml_prefix 1
\end_header

\begin_body

\begin_layout Standard

\emph on
Note:

\emph default
 Normalized floating point arithmetic is assumed unless the contrary is specified.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
exerc4[10]
\end_layout

\end_inset

Is it possible to have floating point numbers 
\begin_inset Formula $u$
\end_inset

,
 
\begin_inset Formula $v$
\end_inset

,
 and 
\begin_inset Formula $w$
\end_inset

 for which exponent overflow occurs during the calculation of 
\begin_inset Formula $u\otimes(v\otimes w)$
\end_inset

 but not during the calculation of 
\begin_inset Formula $(u\otimes v)\otimes w$
\end_inset

?
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset

Yes.
 If,
 say,
 
\begin_inset Formula $b=10$
\end_inset

,
 
\begin_inset Formula $q=8$
\end_inset

,
 and overflow occurs when the exponent reaches 16,
 let 
\begin_inset Formula $u=(15,.10000001)$
\end_inset

,
 
\begin_inset Formula $v=(9,.33333330)$
\end_inset

,
 and 
\begin_inset Formula $w=(9,.30000000)$
\end_inset

.
 Then 
\begin_inset Formula $v\otimes w=(9,.99999990)$
\end_inset

 and 
\begin_inset Formula $u\otimes(v\otimes w)=(16,.10000000)$
\end_inset

,
 which raises an overflow,
 but 
\begin_inset Formula $u\otimes v=(15,.33333333)$
\end_inset

 and 
\begin_inset Formula $(u\otimes v)\otimes w=(15,.99999999)$
\end_inset

,
 which does not raise an overflow.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc8[20]
\end_layout

\end_inset

Let 
\begin_inset Formula $\epsilon=0.0001$
\end_inset

;
 which of the relations
\begin_inset Formula 
\begin{align*}
u & \prec v\quad(\epsilon), & u & \sim v\quad(\epsilon), & u & \succ v\quad(\epsilon), & u & \approx v\quad(\epsilon)
\end{align*}

\end_inset

hold for the following pairs of base 10,
 excess 0,
 eight-digit floating point numbers?
\end_layout

\begin_layout Enumerate
\begin_inset Formula $u=(1,+.31415927)$
\end_inset

,
 
\begin_inset Formula $v=(1,+.31416000)$
\end_inset

;
\end_layout

\begin_layout Enumerate
\begin_inset Formula $u=(0,+.99997000)$
\end_inset

,
 
\begin_inset Formula $v=(1,+.10000039)$
\end_inset

;
\end_layout

\begin_layout Enumerate
\begin_inset Formula $u=(24,+.60221400)$
\end_inset

,
 
\begin_inset Formula $v=(27,+.00060221)$
\end_inset

;
\end_layout

\begin_layout Enumerate
\begin_inset Formula $u=(24,+.60221400)$
\end_inset

,
 
\begin_inset Formula $v=(31,+.00000006)$
\end_inset

;
\end_layout

\begin_layout Enumerate
\begin_inset Formula $u=(24,+.60221400)$
\end_inset

,
 
\begin_inset Formula $v=(28,+.00000000)$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer
\end_layout

\end_inset


\end_layout

\begin_layout Enumerate
\begin_inset Formula $\sim$
\end_inset

,
 
\begin_inset Formula $\approx$
\end_inset

.
\end_layout

\begin_layout Enumerate
\begin_inset Formula $\sim$
\end_inset

,
 
\begin_inset Formula $\approx$
\end_inset

.
\end_layout

\begin_layout Enumerate
\begin_inset Formula $\sim$
\end_inset

,
 
\begin_inset Formula $\approx$
\end_inset

.
\end_layout

\begin_layout Enumerate
\begin_inset Formula $\sim$
\end_inset

.
\end_layout

\begin_layout Enumerate
\begin_inset Formula $\sim$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc10[M25]
\end_layout

\end_inset

(W.
 M.
 Kahan.) A certain computer performs floating point arithmetic without proper rounding,
 and,
 in fact,
 its floating point multiplication routine ignores all but the first 
\begin_inset Formula $p$
\end_inset

 most significant digits of the 
\begin_inset Formula $2p$
\end_inset

-digit product 
\begin_inset Formula $f_{u}f_{v}$
\end_inset

.
 (Thus when 
\begin_inset Formula $f_{u}f_{v}<1/b$
\end_inset

,
 the least-significant digit of 
\begin_inset Formula $u\otimes v$
\end_inset

 always comes out to be zero,
 due to subsequent normalization.) Show that this causes the monotonicity of multiplication to fail;
 in other words,
 exhibit positive normalized floating point numbers 
\begin_inset Formula $u$
\end_inset

,
 
\begin_inset Formula $v$
\end_inset

,
 and 
\begin_inset Formula $w$
\end_inset

 such that 
\begin_inset Formula $u<v$
\end_inset

 but 
\begin_inset Formula $u\otimes w>v\otimes w$
\end_inset

 on this machine.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset

Assume 
\begin_inset Formula $b=10$
\end_inset

 and 
\begin_inset Formula $p=4$
\end_inset

,
 and let 
\begin_inset Formula $u=.9999<1.000=v$
\end_inset

 and 
\begin_inset Formula $w=.2222$
\end_inset

.
 Then 
\begin_inset Formula $u\otimes w=.2221>.2220=v\otimes w$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc13[M25]
\end_layout

\end_inset

Some programming languages (and even some computers) make use of floating point arithmetic only,
 with no provision for exact calculations with integers.
 If operations on integers are desired,
 we can,
 of course,
 represent an integer as a floating point number;
 and when the floating point operations satisfy the basic definitions in (9),
 we know that all floating point operations will be exact,
 provided that the operands and the answer can each be represented exactly with 
\begin_inset Formula $p$
\end_inset

 significant digits.
 Therefore—
so long as we know that the numbers aren't too large—
we can add,
 subtract,
 or multiply integers with no inaccuracy due to rounding errors.
\end_layout

\begin_layout Standard
But suppose that a programmer wants to determine if 
\begin_inset Formula $m$
\end_inset

 is an exact multiple of 
\begin_inset Formula $n$
\end_inset

,
 when 
\begin_inset Formula $m$
\end_inset

 and 
\begin_inset Formula $n\neq0$
\end_inset

 are integers.
 Suppose further that a subroutine is available to calculate the quantity 
\begin_inset Formula $\text{round}(u\bmod1)=u\mathring{\bmod}1$
\end_inset

 for any given floating point number 
\begin_inset Formula $u$
\end_inset

,
 as in exercise 4.2.1–15.
 One good way to determine whether or not 
\begin_inset Formula $m$
\end_inset

 is a multiple of 
\begin_inset Formula $n$
\end_inset

 might be to test whether or not 
\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$
\end_inset

,
 using the assumed subroutine;
 but perhaps rounding errors in the floating point calculations will invalidate this test in certain cases.
\end_layout

\begin_layout Standard
Find suitable conditions on the range of integer values 
\begin_inset Formula $n\neq0$
\end_inset

 and 
\begin_inset Formula $m$
\end_inset

,
 such that 
\begin_inset Formula $m$
\end_inset

 is a multiple of 
\begin_inset Formula $n$
\end_inset

 if and only if 
\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$
\end_inset

.
 In other words,
 show that if 
\begin_inset Formula $m$
\end_inset

 and 
\begin_inset Formula $n$
\end_inset

 are not too large,
 this test is valid.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset

A suitable condition would be 
\begin_inset Formula $|m|<2b^{p-1}$
\end_inset

;
 for the proof we may assume 
\begin_inset Formula $m,n\geq0$
\end_inset

 as the signs of the operands do not affect the check.
 In every case,
 if 
\begin_inset Formula $n\mid m$
\end_inset

,
 then 
\begin_inset Formula $m\oslash n=\text{round}(\frac{m}{n})$
\end_inset

 will necessarily be an integer and 
\begin_inset Formula $(m\oslash n)\mathring{\bmod}1=0$
\end_inset

.
 For the converse,
 if 
\begin_inset Formula $n\nmid m$
\end_inset

,
 then 
\begin_inset Formula $m\oslash n=\frac{m}{n}+\delta$
\end_inset

,
 where 
\begin_inset Formula $|\delta|\leq\frac{1}{2}b^{e_{m\oslash n}-p-q}$
\end_inset

.
 Note that the exponent is not increased after rounding in the division;
 if it were,
 that would mean that 
\begin_inset Formula $b^{e}(1-\frac{1}{2}b^{-p})\leq\frac{m}{n}<b^{e}$
\end_inset

 for some integer 
\begin_inset Formula $e$
\end_inset

,
 but then 
\begin_inset Formula $nb^{e}(1-\frac{1}{2}b^{-p})\leq m<nb^{e}$
\end_inset

 and,
 because both 
\begin_inset Formula $m$
\end_inset

 and 
\begin_inset Formula $nb^{e}$
\end_inset

 are integers,
 
\begin_inset Formula $\frac{1}{2}nb^{e-p}\geq1$
\end_inset

,
 so 
\begin_inset Formula $nb^{e}\geq2b^{p}$
\end_inset

 and 
\begin_inset Formula $m\geq2b^{p}-1>2b^{p-1}\#$
\end_inset

.
 This means that 
\begin_inset Formula $e_{m\oslash n}=\lfloor\log_{b}\frac{m}{n}\rfloor+1+q$
\end_inset

.
 Now,
 since 
\begin_inset Formula $n\nmid m$
\end_inset

,
 
\begin_inset Formula $\log_{b}\frac{m}{n}\notin\mathbb{Z}$
\end_inset

,
 so 
\begin_inset Formula $\lfloor\log_{b}\frac{m}{n}\rfloor+1=\lceil\log_{b}\frac{m}{n}\rceil=\lceil\log_{b}\frac{m}{2}-\log_{b}\frac{n}{2}\rceil\leq\lceil\log_{b}\frac{m}{2}\rceil-\lfloor\log_{b}\frac{n}{2}\rfloor$
\end_inset

.
 With this,
\begin_inset Formula 
\[
|\delta|\leq\frac{1}{2}b^{\lceil\log_{b}\frac{m}{2}\rceil-\lfloor\log_{b}\frac{n}{2}\rfloor-p}\leq\frac{1}{2}b^{-1}b^{-\lfloor\log_{b}\frac{n}{2}\rfloor}<\frac{1}{2}b^{\cancel{-1}}b^{-(\log_{b}\frac{n}{2}\cancel{-1})}=\frac{1}{n},
\]

\end_inset

but 
\begin_inset Formula $\frac{m}{n}$
\end_inset

 differs from the nearest integer by 
\begin_inset Formula $\frac{1}{n}$
\end_inset

 at least,
 so 
\begin_inset Formula $m\oslash n\notin\mathbb{Z}$
\end_inset

 and 
\begin_inset Formula $(m\oslash n)\mathring{\bmod}1\neq0$
\end_inset

.
 This is assuming that there's no exponent underflow,
 which would be rare because it would mean that 
\begin_inset Formula $q<p-1$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc15[M24]
\end_layout

\end_inset

(H.
 Björk.) Does the computed midpoint of an interval always lie between the endpoints?
 (In other words,
 does 
\begin_inset Formula $u\leq v$
\end_inset

 imply that 
\begin_inset Formula $u\leq(u\oplus v)\oslash2\leq v$
\end_inset

?)
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset

No.
 For example,
 if 
\begin_inset Formula $b=10$
\end_inset

,
 
\begin_inset Formula $p=5$
\end_inset

,
 
\begin_inset Formula $u=5.9998$
\end_inset

,
 and 
\begin_inset Formula $v=5.9999$
\end_inset

,
 then 
\begin_inset Formula $u\oplus v=12.000$
\end_inset

 and 
\begin_inset Formula $(u\oplus v)\oslash2=6.0000>v$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc25[15]
\end_layout

\end_inset

When people speak about inaccuracy in floating point arithmetic they often ascribe errors to 
\begin_inset Quotes eld
\end_inset

cancellation
\begin_inset Quotes erd
\end_inset

 that occurs during the subtraction of nearly equal quantities.
 But when 
\begin_inset Formula $u$
\end_inset

 and 
\begin_inset Formula $v$
\end_inset

 are approximately equal,
 the difference 
\begin_inset Formula $u\ominus v$
\end_inset

 is obtained exactly,
 with no error.
 What do these people really mean?
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset

They really mean that,
 if the inputs carry a relative error due to rounding,
 the relative error of the output is potentially much bigger.
 Let 
\begin_inset Formula $u_{0}$
\end_inset

 be the 
\begin_inset Quotes eld
\end_inset

correct
\begin_inset Quotes erd
\end_inset

 value of 
\begin_inset Formula $u$
\end_inset

,
 that is,
 the value it would have if the operations so far had been carried out with infinite precision,
 and let 
\begin_inset Formula $v_{0}$
\end_inset

 be the 
\begin_inset Quotes eld
\end_inset

correct
\begin_inset Quotes erd
\end_inset

 value of 
\begin_inset Formula $v$
\end_inset

,
 similarly defined.
 Let 
\begin_inset Formula $u\eqqcolon u_{0}(1+\delta)$
\end_inset

 and 
\begin_inset Formula $v\eqqcolon v_{0}(1+\delta')$
\end_inset

.
 Then,
 if 
\begin_inset Formula $u$
\end_inset

 and 
\begin_inset Formula $v$
\end_inset

 are nearly equal and 
\begin_inset Formula $u\ominus v$
\end_inset

 is obtained exactly,
 we have 
\begin_inset Formula 
\[
\frac{u\ominus v}{u_{0}-v_{0}}=\frac{u_{0}(1+\delta)-v_{0}(1+\delta')}{u_{0}-v_{0}}=1+\frac{u_{0}\delta-v_{0}\delta'}{u_{0}-v_{0}}.
\]

\end_inset

Even if 
\begin_inset Formula $\delta$
\end_inset

 and 
\begin_inset Formula $\delta'$
\end_inset

 are small,
 the new relative error 
\begin_inset Formula $\left|\frac{u_{0}\delta-v_{0}\delta'}{u_{0}-v_{0}}\right|$
\end_inset

 can be quite big,
 as there's no reason for 
\begin_inset Formula $\delta$
\end_inset

 and 
\begin_inset Formula $\delta'$
\end_inset

 to be similar.
 In the worst case where 
\begin_inset Formula $\delta=-\delta'$
\end_inset

,
 the relative error of the inputs is multiplied by 
\begin_inset Formula $\left|\frac{u_{0}+v_{0}}{u_{0}-v_{0}}\right|$
\end_inset

.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
rexerc29[M25]
\end_layout

\end_inset

Give an example to show that the condition 
\begin_inset Formula $b^{p}\geq3$
\end_inset

 is necessary in the previous exercise.
\end_layout

\begin_layout Standard
\begin_inset ERT
status open

\begin_layout Plain Layout


\backslash
answer 
\end_layout

\end_inset


\begin_inset Note Greyedout
status open

\begin_layout Plain Layout
(I had to look up the solution.) 
\end_layout

\end_inset

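Take 
\begin_inset Formula $b=2$
\end_inset

 and 
\begin_inset Formula $p=1$
\end_inset

,
 so that 
\begin_inset Formula $b^{p}=2<3$
\end_inset

.
 Here 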
\begin_inset Formula $\text{round}(x)=2^{e}$
\end_inset

 for some integer 
\begin_inset Formula $e$
\end_inset

 such that 
\begin_inset Formula $|x-2^{e}|$
\end_inset

 is smallest.
 If 
\begin_inset Formula $f(x)\coloneqq x^{99/100}$
\end_inset

,
 then 
\begin_inset Formula $g(y)\coloneqq y^{100/99}$
\end_inset

.
 Now,
 for integer 
\begin_inset Formula $e$
\end_inset

,
\begin_inset Formula 
\begin{multline*}
\text{round}(f(2^{e}))=\text{round}(2^{e\cdot99/100})<2^{e}\iff(2^{99/100})^{e}<\frac{3}{4}2^{e}\iff\\
\iff(2^{-1/100})^{e}<\frac{3}{4}\iff e>41.
\end{multline*}

\end_inset

Conversely,
\begin_inset Formula 
\begin{multline*}
\text{round}(g(2^{e}))=\text{round}(2^{e\cdot100/99})\leq2^{e}\iff(2^{100/99})^{e}<\frac{3}{2}2^{e}\iff\\
\iff(2^{1/99})^{e}<\frac{3}{2}\iff e<58.
\end{multline*}

\end_inset

Thus,
 if 
\begin_inset Formula $e\in\{42,\dots,58\}$
\end_inset

,
 
\begin_inset Formula $\hat{h}(2^{e})<2^{e}$
\end_inset

,
 and it's easy to see that in fact 
\begin_inset Formula $\hat{h}(2^{e})=2^{e-1}$
\end_inset

.
 Thus 
\begin_inset Formula $\hat{h}^{2}(2^{53})=2^{51}\neq2^{50}=\hat{h}^{3}(2^{53})$
\end_inset

.
\end_layout

\end_body
\end_document