Editing Floating-point arithmetic (section)

== References ==
{{reflist|30em|refs=
<ref name="Smith_1997">{{cite book |last=Smith |first=Steven W. |title=The Scientist and Engineer's Guide to Digital Signal Processing |chapter-url=http://www.dspguide.com/ch28/4.htm |access-date=2012-12-31 |date=1997 |publisher=California Technical Pub |isbn=978-0-9660176-3-2 |page=514 |chapter=Chapter 28, Fixed versus Floating Point}}</ref>
<ref name="Rojas_1997">{{cite journal |title=Konrad Zuse's Legacy: The Architecture of the Z1 and Z3 |last=Rojas |first=Raúl |author-link=Raúl Rojas |journal=[[IEEE Annals of the History of Computing]] |volume=19 |number=2 |date=April–June 1997 |pages=5–16 |doi=10.1109/85.586067 |url=http://ed-thelen.org/comp-hist/Zuse_Z1_and_Z3.pdf |access-date=2022-07-03 |url-status=live |archive-url=https://web.archive.org/web/20220703082408/http://ed-thelen.org/comp-hist/Zuse_Z1_and_Z3.pdf |archive-date=2022-07-03}} (12 pages)</ref>
<ref name="Rojas_2014">{{cite arXiv |eprint=1406.1886 |title=The Z1: Architecture and Algorithms of Konrad Zuse's First Computer |first=Raúl |last=Rojas |author-link=Raúl Rojas |date=2014-06-07|class=cs.AR }}</ref>
<ref name="Kahan_1997_JVNL">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/SIAMjvnl.pdf |archive-url=https://web.archive.org/web/20080905103125/http://www.cs.berkeley.edu/~wkahan/SIAMjvnl.pdf |archive-date=2008-09-05 |url-status=live |title=The Baleful Effect of Computer Languages and Benchmarks upon Applied Mathematics, Physics and Chemistry. John von Neumann Lecture |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=1997-07-15 |page=3}}</ref>
<ref name="Randell_1982_2">{{cite book |editor-last=Randell |editor-first=Brian |editor-link=Brian Randell |title=The Origins of Digital Computers: Selected Papers |edition=3rd |publisher=[[Springer-Verlag]] |date=1982 |orig-date=1973 |location=Berlin; New York |page=244 |isbn=978-3-540-11319-5}}</ref>
<ref name="Kahan_2001_JavaHurt">{{cite web |first1=William Morton |last1=Kahan |author-link1=William Morton Kahan |first2=Joseph |last2=Darcy |date=2001 |orig-date=1998-03-01 |url=https://people.eecs.berkeley.edu/~wkahan/JAVAhurt.pdf |archive-url=https://web.archive.org/web/20000816043653/http://www.cs.berkeley.edu/~wkahan/JAVAhurt.pdf |archive-date=2000-08-16 |url-status=live |title=How Java's floating-point hurts everyone everywhere |access-date=2003-09-05}}</ref>
<ref name="Kahan_2004">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/Qdrtcs.pdf |archive-url=https://web.archive.org/web/20060525111157/http://www.cs.berkeley.edu/~wkahan/Qdrtcs.pdf |archive-date=2006-05-25 |url-status=live |first=William Morton |last=Kahan |author-link=William Morton Kahan |title=On the Cost of Floating-Point Computation Without Extra-Precise Arithmetic |date=2004-11-20 |access-date=2012-02-19}}</ref>
<ref name="Kahan_2006_Mindless">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf |archive-url=https://web.archive.org/web/20041221020332/http://www.cs.berkeley.edu/~wkahan/Mindless.pdf |archive-date=2004-12-21 |url-status=live |title=How Futile are Mindless Assessments of Roundoff in Floating-Point Computation? |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=2006-01-11}}</ref>
<ref name="Severance_1998">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/ieee754status/754story.html |title=An Interview with the Old Man of Floating-Point |first=Charles |last=Severance |author-link=Charles Severance (computer scientist) |date=1998-02-20}}</ref>
<ref name="Kahan_1997_Status">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF |archive-url=https://web.archive.org/web/20020622093102/http://www.cs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF |archive-date=2002-06-22 |url-status=live |title=Lecture Notes on the Status of IEEE Standard 754 for Binary Floating-Point Arithmetic |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=1997-10-01 |page=9}}</ref>
<ref name="Intel">{{cite book |chapter-url=http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html |title=Intel 64 and IA-32 Architectures Software Developers' Manuals |volume=1 |chapter=D.3.2.1}}</ref>
<ref name="Higham_2002">{{cite book |title=Accuracy and Stability of Numerical Algorithms |edition=2nd |first=Nicholas John |author-link=Nicholas Higham |last=Higham |publisher=[[Society for Industrial and Applied Mathematics]] (SIAM) |date=2002 |isbn=978-0-89871-521-7 |id=0-89871-355-2 |pages=27–28, 110–123, 493 |url=https://books.google.com/books?id=epilvM5MMxwC}}</ref>
<ref name="OpenEXR">{{cite web |url=http://www.openexr.com/about.html |title=openEXR |publisher=openEXR |access-date=2012-04-25 |archive-date=2013-05-08  |archive-url=https://web.archive.org/web/20130508221152/http://www.openexr.com/about.html |url-status=dead |quote=Since the IEEE-754 floating-point specification does not define a 16-bit format, ILM created the "half" format. Half values have 1 sign bit, 5 exponent bits, and 10 mantissa bits.}}</ref><!-- Note: As this can be deduced from the documentation, "10 mantissa bits" excludes the implicit bit. So this 16-bit format was not in IEEE 754 at its creation, but it actually corresponds to the now existing binary16 format. -->
<ref name="OpenEXR-half">{{cite web |url=https://openexr.com/en/latest/TechnicalIntroduction.html#the-half-data-type |title=Technical Introduction to OpenEXR – The half Data Type |publisher=openEXR |access-date=2024-04-16}}</ref>
<ref name="IEEE-754_Analysis">{{cite web|url=https://christophervickery.com/IEEE-754/|title=IEEE-754 Analysis|access-date=2024-08-29}}</ref>
<ref name="Goldberg_1991">{{cite journal |first=David |last=Goldberg |author-link=David Goldberg (PARC) |title=What Every Computer Scientist Should Know About Floating-Point Arithmetic |journal=[[ACM Computing Surveys]] |date=March 1991 |volume=23 |issue=1 |pages=5–48 |doi=10.1145/103162.103163 |doi-access=free |s2cid=222008826}} (With the addendum "Differences Among IEEE 754 Implementations": [https://web.archive.org/web/20171011072644/http://www.cse.msu.edu/~cse320/Documents/FloatingPoint.pdf], [https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html])</ref>
<ref name="Harris">{{Cite journal |title=You're Going To Have To Think! |first=Richard |last=Harris |journal=[[Overload (magazine)|Overload]] |issue=99 |date=October 2010 |issn=1354-3172 |pages=5–10 |url=http://accu.org/index.php/journals/1702 |access-date=2011-09-24 |quote=Far more worrying is cancellation error which can yield catastrophic loss of precision.}} [http://accu.org/var/uploads/journals/overload99.pdf]</ref>
<ref name="GAO report IMTEC 92-26">{{cite web |url=http://www.gao.gov/products/IMTEC-92-26 |title=Patriot missile defense, Software problem led to system failure at Dhahran, Saudi Arabia |id=GAO report IMTEC 92-26 |publisher=[[US Government Accounting Office]]}}</ref>
<ref name="Skeel">{{citation |url=https://www-users.cse.umn.edu/~arnold/disasters/Patriot-dharan-skeel-siam.pdf |title=Roundoff Error and the Patriot Missile |last=Skeel |first=Robert |journal=SIAM News |volume=25 |issue=4 |page=11 |date=July 1992 |access-date=2024-11-15}}</ref>
<ref name="RalstonReilly2003">{{cite book |first=James Hardy |last=Wilkinson |author-link=James Hardy Wilkinson |editor-first1=Anthony |editor-last1=Ralston |editor-first2=Edwin D. |editor-last2=Reilly |editor-first3=David |editor-last3=Hemmendinger |chapter=Error Analysis |title=Encyclopedia of Computer Science |pages=669–674 |url=https://books.google.com/books?id=OLRwQgAACAAJ |access-date=2013-05-14 |date=2003-09-08 |publisher=[[Wiley (publisher)|Wiley]] |isbn=978-0-470-86412-8}}</ref>
<ref name="Einarsson_2005">{{cite book |first=Bo |last=Einarsson |title=Accuracy and reliability in scientific computing |url=https://books.google.com/books?id=sh4orx_qB_QC&pg=PA50 |access-date=2013-05-14 |date=2005 |publisher=[[Society for Industrial and Applied Mathematics]] (SIAM) |isbn=978-0-89871-815-7 |pages=50–}}</ref>
<ref name="Kahan_2005_ARITH17">{{cite conference |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=2005-07-15 |title=Floating-Point Arithmetic Besieged by "Business Decisions" |type=Keynote Address |conference=IEEE-sponsored [[ARITH 17]], Symposium on Computer Arithmetic |pages=6, 18 |url=https://people.eecs.berkeley.edu/~wkahan/ARITH_17.pdf |access-date=2013-05-23 |url-status=live |archive-url=https://web.archive.org/web/20060317103619/http://www.cs.berkeley.edu/~wkahan/ARITH_17.pdf |archive-date=2006-03-17}} (NB. Kahan estimates that the incidence of excessively inaccurate results near singularities is reduced by a factor of approx. 1/2000 using the 11 extra bits of precision of [[extended precision|double extended]].)</ref>
<ref name="OliveiraStewart_2006">{{cite book |first1=Suely |last1=Oliveira |first2=David E. |last2=Stewart |title=Writing Scientific Software: A Guide to Good Style |url=https://books.google.com/books?id=E6a8oZOS8noC&pg=PA10 |date=2006-09-07 |publisher=[[Cambridge University Press]] |isbn=978-1-139-45862-7 |pages=10–}}</ref>
<ref name="Kahan_2011_Debug">{{cite conference |url=https://people.eecs.berkeley.edu/~wkahan/Boulder.pdf |archive-url=https://web.archive.org/web/20130620140729/http://www.eecs.berkeley.edu/~wkahan/Boulder.pdf |archive-date=2013-06-20 |url-status=live |title=Desperately Needed Remedies for the Undebuggability of Large Floating-Point Computations in Science and Engineering |conference=IFIP/SIAM/NIST Working Conference on Uncertainty Quantification in Scientific Computing, Boulder, CO |page=33 |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=2011-08-03}}</ref>
<ref name="Kahan_1981_WhyIEEE">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/ieee754status/why-ieee.pdf |archive-url=https://web.archive.org/web/20041204070746/http://www.cs.berkeley.edu/~wkahan/ieee754status/why-ieee.pdf |archive-date=2004-12-04 |url-status=live |title=Why do we need a floating-point arithmetic standard? |page=26 |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=1981-02-12}}</ref>
<ref name="Kahan_2001_LN">{{cite web|url=http://www.cims.nyu.edu/~dbindel/class/cs279/notes-06-04.pdf |archive-url=https://web.archive.org/web/20130517181356/http://www.cims.nyu.edu/~dbindel/class/cs279/notes-06-04.pdf |archive-date=2013-05-17 |url-status=live |first=William Morton |last=Kahan |author-link=William Morton Kahan |editor-first=David |editor-last=Bindel |title=Lecture notes of System Support for Scientific Computation |date=2001-06-04}}</ref>
<ref name="Speleotrove_2012">{{cite web |url=https://speleotrove.com/decimal/ |title=General Decimal Arithmetic |publisher=Speleotrove.com |access-date=2012-04-25}}</ref>
<ref name="Kahan_2000_Marketing">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/MktgMath.pdf |archive-url=https://web.archive.org/web/20030815150333/http://www.cs.berkeley.edu/~wkahan/MktgMath.pdf |archive-date=2003-08-15 |url-status=live |title=Marketing versus Mathematics |pages=15, 35, 47 |first=William Morton |last=Kahan |author-link=William Morton Kahan |date=2000-08-27}}</ref>
<ref name="Shewchuk">{{cite journal | title=Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric Predicates | journal=[[Discrete & Computational Geometry]] | volume=18 | pages=305–363 |first=Jonathan Richard |last=Shewchuk |date=1997 | issue=3 | doi=10.1007/PL00009321 | doi-access=free}}</ref>
<ref name="Christiansen_Perl">{{cite web |url=https://perldoc.perl.org/5.8.8/perlfaq4#Why-is-int()-broken? |title=perlfaq4 / Why is int() broken? |first1=Tom |last1=Christiansen |first2=Nathan |last2=Torkington |date=2006 |publisher=perldoc.perl.org |access-date=2011-01-11 |display-authors=etal}}</ref>
<ref name="Kahan_1997_Cantilever">{{cite web |url=https://people.eecs.berkeley.edu/~wkahan/Cantilever.pdf |archive-url=https://web.archive.org/web/20031205191038/http://www.cs.berkeley.edu/~wkahan/Cantilever.pdf |archive-date=2003-12-05 |url-status=live |title=Roundoff Degrades an Idealized Cantilever |first1=William Morton |last1=Kahan |author-link1=William Morton Kahan |first2=Melody Y. |last2=Ivory |date=1997-07-03}}</ref>
<ref name="Muller_2010">{{cite book |last1=Muller |first1=Jean-Michel |last2=Brisebarre |first2=Nicolas |last3=de Dinechin |first3=Florent |last4=Jeannerod |first4=Claude-Pierre |last5=Lefèvre |first5=Vincent |last6=Melquiond |first6=Guillaume |last7=Revol |first7=Nathalie|author7-link=Nathalie Revol |last8=Stehlé |first8=Damien |last9=Torres |first9=Serge |title=Handbook of Floating-Point Arithmetic<!-- |chapter=Chapter 2, Definitions and Basic Notions--> |date=2010 |publisher=[[Birkhäuser]] |edition=1st |isbn=978-0-8176-4704-9<!-- print --> |doi=10.1007/978-0-8176-4705-6 |lccn=2009939668 <!-- |page=16 --> |ref=muller_et_al_pg_16 |url=https://books.google.com/books?id=baFvrIOPvncC&pg=PA16}}</ref>
<ref name="Savard_2018">{{citation |title=The Decimal Floating-Point Standard |first=John J. G. |last=Savard |date=2018 |orig-date=2007 |work=quadibloc |url=http://www.quadibloc.com/comp/cp020302.htm |access-date=2018-07-16 |url-status=live |archive-url=https://web.archive.org/web/20180703002322/http://www.quadibloc.com/comp/cp020302.htm |archive-date=2018-07-03}}</ref>
<ref name="Zehendner_2008">{{cite web |type=Lecture script |title=Rechnerarithmetik: Fest- und Gleitkommasysteme |date=Summer 2008 |first=Eberhard |last=Zehendner |language=de |publisher=[[Friedrich-Schiller-Universität Jena]] |page=2 |url=https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.05.handout.pdf |access-date=2018-08-07 |url-status=live |archive-url=https://web.archive.org/web/20180807062449/https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.05.handout.pdf |archive-date=2018-08-07}} [https://web.archive.org/web/20180806175620/https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.komplett.pdf] (NB. This reference incorrectly gives the MANIAC II's floating point base as 256, whereas it actually is 65536.)</ref>
<ref name="Lazarus_1956">{{cite web |title=MANIAC II |first=Roger B. |last=Lazarus |date=1957-01-30 |orig-date=1956-10-01 |publisher=Los Alamos Scientific Laboratory of the University of California |location=Los Alamos, NM, USA |id=LA-2083 |page=14 |url=http://bitsavers.org/pdf/lanl/LA-2083_MANIAC_II_Oct56.pdf |access-date=2018-08-07 |archive-url=https://web.archive.org/web/20180807200914/http://bitsavers.org/pdf/lanl/LA-2083_MANIAC_II_Oct56.pdf |archive-date=2018-08-07 |url-status=live |quote=[…] the Maniac's floating base, which is 2<sup>16</sup> = 65,536. […] The Maniac's large base permits a considerable increase in the speed of floating point arithmetic. Although such a large base implies the possibility of as many as 15 lead zeros, the large word size of 48 bits guarantees adequate significance. […]}}</ref>
<ref name="Beebe_2017">{{cite book |first=Nelson H. F. |last=Beebe |title=The Mathematical-Function Computation Handbook - Programming Using the MathCW Portable Software Library |chapter=Chapter H. Historical floating-point architectures |date=2017-08-22 |location=Salt Lake City, UT, USA |publisher=[[Springer International Publishing AG]] |edition=1st |lccn=2017947446 |isbn=978-3-319-64109-6 |doi=10.1007/978-3-319-64110-2 |page=948|s2cid=30244721 }}</ref>
<ref name="Parkinson_2000">{{cite book |title=High Resolution Site Surveys |first=Roger |last=Parkinson |publisher=[[CRC Press]] |date=2000-12-07 |chapter=Chapter 2 - High resolution digital site survey systems - Chapter 2.1 - Digital field recording systems |edition=1st |isbn=978-0-20318604-6 |page=24 |chapter-url=https://books.google.com/books?id=Ocip5vpLD4wC&pg=PA24 |access-date=2019-08-18 |quote=[…] Systems such as the [Digital Field System] DFS IV and DFS V were quaternary floating-point systems and used gain steps of 12&nbsp;dB. […]}} (256 pages)</ref>
<ref name="MSVC">{{cite web |url=https://learn.microsoft.com/en-us/cpp/build/ieee-floating-point-representation |title=IEEE Floating-Point Representation |date=2021-08-03}}</ref>
<ref name="GCC">[https://gcc.gnu.org/onlinedocs/gcc/i386-and-x86-64-Options.html Using the GNU Compiler Collection, i386 and x86-64 Options] {{Webarchive |url=https://web.archive.org/web/20150116065447/http://gcc.gnu.org/onlinedocs/gcc/i386-and-x86-64-Options.html |date=2015-01-16}}.</ref>
<ref name="float_128">{{cite web |url=https://stackoverflow.com/questions/13516476 |title=long double (GCC specific) and __float128 |website=StackOverflow}}</ref>
<ref name="ARM_2013_AArch64">{{cite web |title=Procedure Call Standard for the ARM 64-bit Architecture (AArch64) |url=http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf |archive-url=https://web.archive.org/web/20130731181404/http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf |archive-date=2013-07-31 |url-status=live |date=2013-05-22 |access-date=2019-09-22}}</ref>
<ref name="ARM_2013_Compiler">{{cite web |title=ARM Compiler toolchain Compiler Reference, Version 5.03 |url=http://infocenter.arm.com/help/topic/com.arm.doc.dui0491i/DUI0491I_arm_compiler_reference.pdf |archive-url=https://web.archive.org/web/20150627210618/http://infocenter.arm.com/help/topic/com.arm.doc.dui0491i/DUI0491I_arm_compiler_reference.pdf |archive-date=2015-06-27 |url-status=live |at=Section 6.3 ''Basic data types'' |date=2013 |access-date=2019-11-08}}</ref>
<ref name="Kharya_2020">{{Cite web |title=TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x |url=https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/ |last=Kharya |first=Paresh |date=May 14, 2020 |access-date=May 16, 2020}}</ref>
<ref name="Sierra_1962">{{US patent reference |number=3037701A |issue-date=1962-06-05 |inventor=Huberto M Sierra |title=Floating decimal point arithmetic control means for calculator}}</ref>
<ref name="Barker">Christopher Barker: [https://www.python.org/dev/peps/pep-0485/ ''PEP 485 -- A Function for testing approximate equality'']</ref>
<ref name="C99">{{cite book |title=ISO/IEC 9899:1999 - Programming languages - C |publisher=Iso.org |at=§F.2, note 307 |quote="Extended" is IEC 60559's double-extended data format. Extended refers to both the common 80-bit and quadruple 128-bit IEC 60559 formats.}}</ref>
<ref name="Microsoft_2006_KB35826">{{cite web |title=IEEE vs. Microsoft Binary Format; Rounding Issues (Complete) |publisher=[[Microsoft]] |work=Microsoft Support |id=Article ID KB35826, Q35826 |date=2006-11-21 |url=https://www.betaarchive.com/wiki/index.php/Microsoft_KB_Archive/35826#IEEE_vs._Microsoft_Binary_Format.3B_Rounding_Issues_.28Complete.29 |access-date=2010-02-24 |url-status=live |archive-url=https://web.archive.org/web/20200828130651/https://www.betaarchive.com/wiki/index.php/Microsoft_KB_Archive/35826 |archive-date=2020-08-28}}</ref>
<ref name="Steil_2008_6502">{{cite web |title=Create your own Version of Microsoft BASIC for 6502 |first=Michael |last=Steil |publisher=pagetable.com |date=2008-10-20 |url=http://www.pagetable.com/?p=46 |access-date=2016-05-30 |url-status=live |archive-url=https://web.archive.org/web/20160530092603/http://www.pagetable.com/?p=46 |archive-date=2016-05-30}}</ref>
<ref name="Borland_1994_MBF">{{cite web |title=Converting between Microsoft Binary and IEEE formats |date=1998-07-02 |orig-date=1994-03-10 |id=ID 1400 |author=Borland staff |publisher=[[Embarcadero USA]] / [[Inprise]] (originally: [[Borland]]) |work=Technical Information Database |type=TI1431C.txt |url=https://community.embarcadero.com/index.php/article/technical-articles/162-programming/14799-converting-between-microsoft-binary-and-ieee-forma |access-date=2016-05-30 |url-status=live |archive-url=https://web.archive.org/web/20190220230417/https://community.embarcadero.com/index.php/article/technical-articles/162-programming/14799-converting-between-microsoft-binary-and-ieee-forma |archive-date=2019-02-20 |quote=[…] _fmsbintoieee(float *src4, float *dest4) […] MS Binary Format […] byte order => m3 {{!}} m2 {{!}} m1 {{!}} exponent […] m1 is [[most significant byte]] => sbbb{{!}}bbbb […] m3 is the [[least significant byte]] […] m = mantissa byte […] s = sign bit […] b = bit […] MBF is bias 128 and IEEE is bias 127. […] MBF places the [[decimal point]] before the [[assumed bit]], while IEEE places the decimal point after the assumed bit. […] ieee_exp = msbin[3] - 2; /* actually, msbin[3]-1-128+127 */ […] _dmsbintoieee(double *src8, double *dest8) […] MS Binary Format […] byte order =>  m7 {{!}} m6 {{!}} m5 {{!}} m4 {{!}} m3 {{!}} m2 {{!}} m1 {{!}} exponent […] m1 is most significant byte => smmm{{!}}mmmm […] m7 is the least significant byte […] MBF is bias 128 and IEEE is bias 1023. […] MBF places the decimal point before the assumed bit, while IEEE places the decimal point after the assumed bit. […] ieee_exp = msbin[7] - 128 - 1 + 1023; […]}}</ref>
<ref name="NVIDIA_Hopper">{{cite web |title=NVIDIA Hopper Architecture In-Depth |date=22 March 2022 |url=https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/}}</ref>
<ref name="Micikevicius_2022">{{cite arXiv |last1=Micikevicius |first1=Paulius |last2=Stosic |first2=Dusan |last3=Burgess |first3=Neil |last4=Cornea |first4=Marius |last5=Dubey |first5=Pradeep |last6=Grisenthwaite |first6=Richard |last7=Ha |first7=Sangwon |last8=Heinecke |first8=Alexander |last9=Judd |first9=Patrick |last10=Kamalu |first10=John |last11=Mellempudi |first11=Naveen |last12=Oberman |first12=Stuart |last13=Shoeybi |first13=Mohammad |last14=Siu |first14=Michael |last15=Wu |first15=Hao |eprint=2209.05433 |title=FP8 Formats for Deep Learning |class=cs.LG |date=2022-09-12}}</ref>
<ref name="Gay_1990">{{cite tech report |last=Gay |first=David M. |title=Correctly Rounded Binary-Decimal and Decimal-Binary Conversions |date=1990 |institution=NUMERICAL ANALYSIS MANUSCRIPT 90-10, AT&T BELL LABORATORIES |citeseerx=10.1.1.31.4049}} ([http://www.netlib.org/fp/dtoa.c dtoa.c in netlib])</ref>
<ref name="Loitsch_2010">{{cite conference
|last=Loitsch
|first=Florian
|title=Printing floating-point numbers quickly and accurately with integers
|conference=PLDI '10: ACM SIGPLAN Conference on Programming Language Design and Implementation
|book-title=Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation
|date=2010
|pages=233–243
|doi=10.1145/1806596.1806623
|isbn=978-1-45030019-3
|s2cid=910409
|url=https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf
|archive-url=https://web.archive.org/web/20140729005717/http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf |archive-date=2014-07-29
|url-status=live
}}</ref>
<ref name="mazong">{{cite web |title=Added Grisu3 algorithm support for double.ToString(). by mazong1123 · Pull Request #14646 · dotnet/coreclr |url=https://github.com/dotnet/coreclr/pull/14646 |website=GitHub |language=en}}</ref>
<ref name="Adams_2018">{{cite journal |last=Adams |first=Ulf |title=Ryū: fast float-to-string conversion |journal=ACM SIGPLAN Notices |date=2 December 2018 |volume=53 |issue=4 |pages=270–282 |doi=10.1145/3296979.3192369 |s2cid=218472153 |doi-access=free}}</ref>
<ref name="Giulietti">{{cite web |last=Giulietti |first=Rafaello |title=The Schubfach way to render doubles |url=https://drive.google.com/file/d/1IEeATSVnEE6TkrHlCYNY2GjaraBjOT4f}}</ref>
<ref name="abolz">{{cite web |title=abolz/Drachennest |website=[[GitHub]] |url=https://github.com/abolz/Drachennest |date=10 November 2022}}</ref>
<ref name="double_conversion_2020">{{cite web |title=google/double-conversion |website=[[GitHub]] |url=https://github.com/google/double-conversion |date=21 September 2020}}</ref>
<ref name="Lemire_2021">{{cite journal |last=Lemire |first=Daniel |title=Number parsing at a gigabyte per second |journal=Software: Practice and Experience |date=22 March 2021 |volume=51 |issue=8 |pages=1700–1727 |doi=10.1002/spe.2984 |arxiv=2101.11408 |s2cid=231718830}}</ref>
<ref name="Patterson-Hennessy_2014">{{cite book |last1=Patterson |first1=David A. |last2=Hennessy |first2=John L. |title=Computer Organization and Design, The Hardware/Software Interface |publisher=Elsevier |series=The Morgan Kaufmann series in computer architecture and design |edition=5th |date=2014 |location=Waltham, Massachusetts, USA |pages=793 |language=en |isbn=978-9-86605267-5}}<!-- {{rp|218–220}} --></ref>
<ref name="Vectorizers">{{cite web |title=Auto-Vectorization in LLVM |url=https://llvm.org/docs/Vectorizers.html |website=LLVM 13 documentation |quote=We support floating point reduction operations when -ffast-math is used.}}</ref>
<ref name="FPM">{{cite web |title=FloatingPointMath |url=https://gcc.gnu.org/wiki/FloatingPointMath |website=GCC Wiki}}</ref>
<ref name="harmful">{{cite web |title=55522 – -funsafe-math-optimizations is unexpectedly harmful, especially w/ -shared |url=https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55522 |website=gcc.gnu.org}}</ref>
<ref name="Gen">{{cite web |title=Code Gen Options (The GNU Fortran Compiler) |url=https://gcc.gnu.org/onlinedocs/gfortran/Code-Gen-Options.html |website=gcc.gnu.org}}</ref>
<ref name="zheevd">{{cite web |title=Bug in zheevd · Issue #43 · Reference-LAPACK/lapack |url=https://github.com/Reference-LAPACK/lapack/issues/43 |website=GitHub |language=en}}</ref>
<ref name="Becker-Darulova-Myreen-Tatlock_2019">{{cite conference |last1=Becker |first1=Heiko |last2=Darulova |first2=Eva |last3=Myreen |first3=Magnus O. |last4=Tatlock |first4=Zachary |title=Icing: Supporting Fast-Math Style Optimizations in a Verified Compiler |conference=CAV 2019: Computer Aided Verification |date=2019 |volume=11562 |pages=155–173 |doi=10.1007/978-3-030-25543-5_10 |doi-access=free}}</ref>
}}