%&latex
\documentclass[final,pdf,slideColor,colorBG]{prosper}

\usepackage[toc,highlight,hlsections,capsules]{HA-prosper}
\usepackage{amsmath}
\usepackage{graphicx,wrapfig}
\usepackage{bibentry,natbib}
\usepackage{dimadefs}

\title{Fast Fourier Transform}
\subtitle{Key Papers in Computer Science Seminar 2005}
\author{
  Dima Batenkov\\
  \institution{Weizmann Institute of Science}\\
  \email{dima.batenkov@gmail.com}\\
}


\newcommand{\fourier}{\ensuremath{\mathcal{F}}}
\newcommand{\unityroot}{\ensuremath{\mathbf{W}}}

\newcommand{\thelogo}{\includegraphics{images/logo1_blue}}

\renewcommand{\vec}[1]{\ensuremath{\mathbf{#1}}}

\newtheorem{thm}{Theorem}
\newtheorem{dfn}{Definition}

\HAPsetup{counters={thm,dfn},stype=0,sn={p.\thepage\ifallPages/\totalpages\fi}}

\Logo(0,0){\thelogo}

\bibliographystyle{plainnat}
%\newcommand{\bibfont}{\tiny}
\citestyle{nature}

\begin{document}

\nobibliography{lecture-bib}

\maketitle

\overlays{4}{
\begin{slide}{Fast Fourier Transform - Overview}
  \bibentry{cotu:65}
  \begin{itemstep}[sstart=2]
    \xitem A fast algorithm for computing the Discrete
    Fourier Transform
    \xitem (Re)discovered by Cooley \& Tukey in 1965 \cite{cotu:65}
    and widely adopted thereafter
    \xitem Has a long and fascinating history
  \end{itemstep}
\end{slide}
}

\tsectionandpart*{Fourier Analysis}

\newcommand{\period}{\ensuremath{L}}
\newcommand{\trigargument}{\ensuremath{\frac{\pi n}{\period}}}
\overlays{4}{
\begin{slide}{Fourier Series}
  \begin{itemstep}
    \onSlide*{1}
    {
    \item Expresses a (real) periodic function $x(t)$ as a sum of
      trigonometric series ($-\period < t < \period$)
      \begin{align*}
        x(t) = \frac{1}{2}a_0 + \sum_{n=1}^{\infty}(a_n \cos{\trigargument t} +
        b_n \sin{\trigargument t})
      \end{align*}
    \item Coefficients can be computed by
      \begin{align*}
        a_n &= \frac{1}{\period}\int_{-\period}^{\period} x(t)
        \cos{\trigargument t}\,dt\\
        b_n &= \frac{1}{\period}\int_{-\period}^{\period} x(t)
        \sin{\trigargument t}\,dt
      \end{align*}
    }
    
    \onSlide*{2-4}{
    \item Generalized to complex-valued functions as
      \begin{align*}
        x(t) &= \sum_{n=-\infty}^{\infty} c_n e^{i  \trigargument t}\\
        c_n &= \frac{1}{2\period}\int_{-\period}^{\period} x(t)
        e^{-i  \trigargument t}\,dt
      \end{align*}
    }
    
    \onSlide*{3-4}{
    \item Studied by D.Bernoulli and L.Euler
    \item Used by Fourier to solve the heat equation
    }
    \onSlide*{4}{
    \item Converges for almost all ``nice'' functions (piecewise
      smooth, $L^2$ etc.)
    }
  \end{itemstep}
\end{slide}
}

\begin{slide}{Continuous Fourier Transform}
  \begin{itemize}
  \item Generalization of Fourier Series for infinite domains
    \begin{align*}
        x(t) &= \int_{-\infty}^{\infty} \fourier(f) e^{-2\pi i f
          t}\,df \\
        \fourier(f) &= \int_{-\infty}^{\infty} x(t) e^{2\pi i f t}\,dt
    \end{align*}
  \item Can represent continuous, aperiodic signals
  \item Continuous frequency spectrum
  \end{itemize}
\end{slide}

\begin{slide}{Discrete Fourier Transform}
  \begin{itemize}
  \item If the signal $X(k)$ is periodic, band-limited and sampled at Nyquist frequency
    or higher, the DFT represents the CFT exactly \cite{Shannon49}
    \begin{align*}
      A(r) = \sum_{k=0}^{N-1} X(k)\unityroot_N^{rk}
    \end{align*}
    where $\unityroot_N = e^{-\frac{2\pi i}{N}}$ and $r=0,1,\dots,N-1$
  \item The inverse transform:
    \begin{align*}
      X(j) = \frac{1}{N}\sum_{k=0}^{N-1} A(k) \unityroot_N^{-jk}
    \end{align*}
  \end{itemize}
\end{slide}

\begin{slide}{Useful properties \cite{gentleman:66}}
  \begin{itemize}
  \item Orthogonality of basis functions
    \begin{align*}
      \sum_{k=0}^{N-1} \unityroot_N^{r'k} \unityroot_N^{rk} = N
      \delta_N (r-r')
    \end{align*}
  \item Linearity
  \item (Cyclic) Convolution theorem
    \begin{align*}
      (x*y)_n &= \sum_{k=0}^{N-1} x(k) y(n-k)\\
      \Rightarrow \fourier\{(x*y)\} &= \fourier\{X\} \fourier\{Y\}
    \end{align*}
  \item Symmetries: real $\leftrightarrow$ hermitian symmetric,
    imaginary $\leftrightarrow$ hermitian antisymmetric
  \item Shifting theorems
  \end{itemize}
\end{slide}

\begin{slide}{Applications}
  \begin{itemize}
  \item Approximation by trigonometric polynomials
    \begin{itemize}
    \item Data compression (MP3...)
    \end{itemize}
  \item Spectral analysis of signals
  \item Frequency response of systems
  \item Partial Differential Equations
  \item Convolution via frequency domain
    \begin{itemize}
    \item Cross-correlation
    \item Multiplication of large integers
    \item Symbolic multiplication of polynomials
    \end{itemize}
  \end{itemize}
\end{slide}


\tsectionandpart*{Methods known by 1965}

\begin{slide}{Available methods}
  \begin{itemize}
  \item In many applications, large digitized datasets are becoming available,
    but cannot be processed due to prohibitive running time of DFT
  \item All (publicly known) methods for efficient computation utilize
    the symmetry of trigonometric functions, but still are $O(N^2)$
  \item Goertzel's method \cite{goertzel:58} is fastest known
  \end{itemize}
\end{slide}

\renewcommand{\trigargument}{\frac{2\pi j}{N}}
\overlays{4}{
\begin{slide}{Goertzel's algorithm \cite{goertzel:58}}
  \onSlide*{1-3}
  {
    \begin{itemize}
    \item Given a particular $0 < j < N-1$, we want to evaluate 
      \begin{align*}
        A(j)&=\sum_{k=0}^{N-1} x(k) \cos \left(\trigargument k\right)
        & 
        B(j)&=\sum_{k=0}^{N-1} x(k) \sin \left(\trigargument k\right)
      \end{align*}
      \onSlide*{2-3}{
      \item Recursively compute the sequence $\{u(s)\}_{s=0}^{N+1}$
        \begin{align*}
          u(s) &= x(s) + 2\cos\left(\trigargument \right) u(s+1) - u(s+2)
          \\
          u(N)&=u(N+1)=0
        \end{align*}
      }
      \onSlide*{3}
      {
      \item Then
        \begin{align*}
          B(j) &= u(1) \sin \left( \trigargument \right)\\
          A(j) &= x(0) + \cos \left(\trigargument \right) u(1) - u(2)
        \end{align*}
      }
    \end{itemize}
  }
  \onSlide*{4}
  {
    \begin{itemize}
    \item Requires $N$ multiplications and only one sine and cosine
    \item Roundoff errors grow rapidly \cite{Gentleman:1969:EAG}
    \item Excellent for computing a very small number of coefficients
      ($< \log N$)
    \item Can be implemented using only 2 intermediate storage
      locations and processing input signal as it arrives
    \item Used today for Dual-Tone Multi-Frequency detection (tone
      dialing)
    \end{itemize}
  }  
\end{slide}
}


\tsectionandpart*{Inspiration for the FFT}

\overlays{6}{
\begin{slide}{Factorial experiments}
  \onSlide*{1}{
    Purpose: investigate the effect of $n$ factors on some measured quantity
    \begin{itemize}
    \item Each factor can have $t$ distinct levels. For our discussion
      let $t=2$, so there are 2 levels: ``$+$'' and ``$-$''.
    \item So we conduct $t^n$ experiments (for every possible
      combination of the $n$ factors) and calculate:
      \begin{itemize}
      \item \emph{Main effect} of a factor: what is the average change in the
        output as the factor goes from ``$-$'' to ``$+$''?
      \item \emph{Interaction effects} between two or more factors: what
        is the difference in output between the case when both factors are present
        compared to when only one of them is present?
      \end{itemize}
    \end{itemize}
  }
  \onSlide*{2}{
    Example of $t=2,\, n=3$ experiment
    \begin{itemize}
    \item Say we have factors $A,B,C$ which can be ``high'' or
      ``low''. For example, they can represent levels of 3 different
      drugs given to patients
    \item Perform $2^3$ measurements of some quantity, say blood
      pressure
    \item Investigate how each one of the drugs and their combination
      affect the blood pressure
    \end{itemize}
  }
  \onSlide*{3}
  {
    \begin{align*}
      \begin{matrix}
        \text{Exp.} & \text{Combination} & A & B & C & \text{Blood
          pressure}\\
        \hline\\
        1 & (1) & - & - & - & x_1\\
        2 &  a  & + & - & - & x_2\\
        3 &  b  & - & + & - & x_3\\
        4 &  ab & + & + & - & x_4\\
        5 &  c  & - & - & + & x_5\\
        6 &  ac & + & - & + & x_6\\
        7 &  bc & - & + & + & x_7\\
        8 & abc & + & + & + & x_8\\
      \end{matrix}
    \end{align*}
  }
  \onSlide*{4-5}
  {
    \begin{itemize}
    \item The main effect of $A$ is $(ab-b)+(a-(1))+(abc-bc)+(ac-c)$
    \item The interaction of $A,B$ is
      $(ab-b)-(a-(1))-((abc-bc)-(ac-c))$
    \item etc.
    \end{itemize}
  }
  \onSlide*{5}
  {
    If the main effects and interactions are arranged in vector
    \vec{y}, then we can write $\vec{y}=A \vec{x}$ where
    \begin{align*}
      A = 
      \begin{pmatrix}
        1&1&1&1&1&1&1&1\\
        1&-1&-1&1&1&-1&-1&1\\
        1&1&-1&-1&1&1&-1&-1\\
        1&-1&-1&1&-1&1&-1&1\\
        1&1&1&1&-1&-1&-1&-1\\
        1&-1&1&-1&-1&1&-1&1\\
        1&1&-1&-1&-1&-1&1&1\\
        1&-1&-1&1&-1&1&1&-1\\
      \end{pmatrix}
    \end{align*}
  }
\onSlide*{6}
{
  \begin{itemize}
  \item F.Yates devised \cite{Yates37} an iterative $n$-step algorithm which
    simplifies the calculation. Equivalent to decomposing $A=B^n$,
    where (for our case of $n=3$):
    \begin{align*}
      B = 
      \begin{pmatrix}
        1&1&0&0&0&0&0&0\\
        0&0&1&1&0&0&0&0\\
        0&0&0&0&1&1&0&0\\
        0&0&0&0&0&0&1&1\\
        -1&-1&0&0&0&0&0&0\\
        0&0&-1&-1&0&0&0&0\\
        0&0&0&0&-1&-1&0&0\\
        0&0&0&0&0&0&-1&-1\\
      \end{pmatrix}
    \end{align*}
    so that much less operations are required
  \end{itemize}
}
\end{slide}
}

\newcommand{\matr}[1]{\ensuremath{M^{(#1)}}}

\overlays{8}{
\begin{slide}{Good's paper \cite{Good58}}
  \onSlide*{1}
  {
    \bibentry{Good58}
    \begin{itemize}
    \item Essentially develops a ``prime-factor'' FFT
    \item The idea is inspired by Yates' efficient algorithm, but Good
      proves general theorems which can be applied to the task
      of efficiently multiplying a $N$-vector by certain $N\times N$
      matrices
    \item The work remains unnoticed until Cooley-Tukey publication
    \item Good meets Tukey in 1956 and tells him briefly about his
      method
    \item It is revealed later \cite{ConversationGood} that the FFT
      had not been discovered much earlier by a coincedence
    \end{itemize}
  }
  \onSlide*{2-3}
  {
    \begin{dfn}
      Let \matr{i}\/ be $t \times t$ matrices, $i=1\dots n$. Then the direct product
      \[
      \matr{1} \times \matr{2} \times \dots \times \matr{n}
      \]
      is a $t^n \times t^n$ matrix $C$ whose elements are
      \[
      C_{r,s} = \prod_{i=1}^n \matr{i}_{r_i,s_i}
      \]
      where $r=(r_1,\dots,r_n)$ and $s=(s_1,\dots\,s_n)$ are the base-$t$
      representation of the index
    \end{dfn}
  }
  \onSlide*{3}
  {
    \begin{thm}\label{thm:Good}
      For every $M$, $M^{[n]} = A^n$ where
      \[
      A _{r,s}= \{M_{r_1,s_n} \delta^{s_1}_{r_2} \delta^{s_2}_{r_3} \dots \delta^{s_{n-1}}_{r_n}\}
      \]
    \end{thm}
    Interpretation: $A$ has at most $t^{n+1}$ nonzero elements. So if,
    for a given $B$ we can find $M$ such that $M^{[n]}=B$, then we can
    compute $B \vec{x}=A^n \vec{x}$ in $O(nt^n)$ instead of $O(t^{2n})$.
  }
  \onSlide*{4-6}
  {
    \textsl{Good's observations}

    \onSlide*{4}
    {
      \begin{itemize}
      \item For $2^n$ factorial experiment:
        \begin{align*}
          M = 
          \begin{pmatrix}
            1 & 1\\
            1 & -1
          \end{pmatrix}
        \end{align*}

      \end{itemize}
    }
    \onSlide*{5-6}
    {
      \begin{itemize}
      \item $n$-dimensional DFT, size $N$ in each dimension
        \begin{align*}
          A(s_1,\dots,s_n) =
          \sum_{(r_1,\dots,r_n)}^{(N-1,\dots,N-1)}x(\vec{r})\unityroot_N^{r_1
            s_1} \times \dots \times \unityroot_N^{r_n s_n}
        \end{align*}
        can be represented as
        \begin{align*}
          A = \Omega^{[n]} \vec{x},\, \Omega = \{\unityroot_N^{rs}\} \quad
          r,s=0, \dots, N-1
        \end{align*}
        By the theorem, $A = \Psi^n \vec{x}$ where $\Psi$ is sparse
        \onSlide*{6}
        {
        \item $n$-dimensional DFT, $N_i$ in each dimension
          \begin{align*}
            A &= (\Omega^{(1)} \times \dots \times \Omega^{(n)}) \vec{x}
            = \Phi^{(1)} \Phi^{(2)} \dots \Phi^{(n)} \vec{x}\\
            \Omega^{(i)} &= \{\unityroot_{N_i}^{rs}\}\\
            \Phi^{(i)} &= I_{N_1} \times \dots \times \Omega^{(i)}
            \times \dots \times I_{N_n}
          \end{align*}
        }
      \end{itemize}
    }
  }
  \onSlide*{7}
  {
    If $N=N_1 N_2$ and $(N_1,N_2)=1$ then $N$-point 1-D DFT can be represented
    as 2-D DFT
    \begin{align*}
      r &= N_2 r_1 + N_1 r_2 \\
      \explntext{Chinese Remainder Thm.} s &= N_2 s_1 (N_2^{-1})_{\mod N_1} + N_1 s_2
      (N_1^{-1})_{\mod N_2} \\
      \Rightarrow s &\equiv s_1 \mod N_1, \quad s\equiv s_2 \mod N_2
    \end{align*}
    These are index mappings $s \leftrightarrow (s_1,s_2)$ and $r
    \leftrightarrow (r_1,r_2)$
    \begin{align*}
      A(s)=A(s_1,s_2) &= \sum_{r=0}^{N} x(r) \unityroot_N^{rs} = \sum_{r_1=0}^{N_1-1} \sum_{r_2=0}^{N_2-1}
      x(r_1,r_2) \unityroot_{N_1 N_2}^{rs} \\
      \explntext{substitute $r,s$ from above}&= \sum_{r_1=0}^{N_1-1} \sum_{r_2=0}^{N_2-1} x(r_1,r_2)
      \unityroot_{N_1}^{r_1 s_1} \unityroot_{N_2}^{r_2 s_2}
    \end{align*}
    This is exactly 2-D DFT seen above!
  }
  \onSlide*{8}
  {
    So, as above, we can write
    \begin{align*}
      A &= \Omega \vec{x} \\
      &= P (\Omega^{(1)} \times \Omega^{(2)}) Q^{-1} \vec{x} \\
      &= P \Phi^{(1)} \Phi^{(2)} Q^{-1} \vec{x}
    \end{align*}
    where $P$ and $Q$ are permutation matrices. This results in
    $N(N_1+N_2)$ multiplications instead of $N^2$.
  }
\end{slide}
}

\begin{slide}{Good's results - summary}
  \begin{itemize}
  \item $n$-dimensionanal, size $N_i$ in each dimension
    \begin{align*} 
      \left(\sum_n N_i\right) \left(\prod_n N_i\right) \text{ instead of } \left(\prod_n N_i\right)^2
    \end{align*}
  \item 1-D, size $N=\prod_n N_i$ where $N_i$'s are mutually prime
    \begin{align*}
      \left(\sum_n N_i\right) N \text{ instead of } N^2
    \end{align*}
  \item Still thinks that other ``tricks'' such as using the
    symmetries and sines and cosines of known angles might be more useful
  \end{itemize}
\end{slide}


\tsectionandpart*{The article}

\begin{slide}{James W.Cooley (1926-)}
  \begin{itemize}
    \dualslide{rcolwidth=30mm}
    {
    \item 1949 - B.A. in Mathematics (Manhattan College, NY)
    \item 1951 - M.A. in Mathematics (Columbia University)
    \item 1961 - Ph.D in Applied Mathematics (Columbia University)
    }
    {
      \includegraphics{images/cooley}    
    }
  \item 1962 - Joins IBM Watson Research Center
  \item The FFT is considered his most significant contribution to
    mathematics
  \item Member of IEEE Digital Signal Processing Committee
  \item Awarded IEEE fellowship on his works on FFT
  \item Contributed much to establishing the terminology of DSP
  \end{itemize}
\end{slide}

\begin{slide}{John Wilder Tukey (1915-2000)}
  \begin{itemize}
    \dualslide{rcolwidth=30mm}
    {
    \item 1936 - Bachelor in Chemistry (Brown University)
    \item 1937 - Master in Chemistry (Brown University)
    \item 1939 - PhD in Mathematics (Princeton) for ``Denumerability in
      Topology''
    }
    {
      \includegraphics{images/tukey}
    }
  \item During WWII joins Fire Control Research Office and contributes
    to war effort (mainly invlolving statistics)
  \item From 1945 also works with Shannon and Hamming in AT\&T Bell
    Labs
  \item Besides co-authoring the FFT in 1965, has several major
    contributions to statistics
  \end{itemize}
\end{slide}

\overlays{4}{
\begin{slide}{The story - meeting}
  \begin{itemstep}
    \xitem Richard Garwin (Columbia University IBM Watson Research
    center) meets John Tukey (professor of mathematics in Princeton)
    at Kennedy's Scientific Advisory Committee in 1963
    \begin{itemstep}
      \xitem Tukey shows how a Fourier series of length $N$
      when $N=ab$ is composite can be expressed as $a$-point series of
      $b$-point subseries, thereby reducing the number of computations
      from $N^2$ to $N(a+b)$
      
      \xitem Notes that if $N$ is a power of two, the algorithm can be
      repeated, giving $N \log_2 N$ operations
      
      \xitem Garwin, being aware of applications in various fields
      that would greatly benefit from an efficient algorithm for DFT,
      realizes the importance of this and from now on, ``pushes'' the
      FFT at every opportunity
    \end{itemstep}
  \end{itemstep}
\end{slide}
}   

\overlays{7}{
\begin{slide}{The story - publication}
  \begin{itemstep}
    \xitem Garwin goes to Jim Cooley (IBM Watson Research Center) with the
    notes from conversation with Tukey and asks to implement the
    algorithm

    \xitem Says that he needs the FFT for computing the 3-D Fourier
    transform of spin orientations of $He^3$. It is revealed later
    that what Garwin really wanted was to be able to detect nuclear
    exposions from seismic data

    \xitem Cooley finally writes a 3-D in-place FFT and sends it to
    Garwin
    
    \xitem The FFT is exposed to a number of mathematicians

    \xitem It is decided that the algorithm should be put in the
    public domain to prevent patenting
       
    \xitem Cooley writes the draft, Tukey adds some references to
    Good's article and the paper is submitted to Mathematics of Computation in  August 1964
    
    \xitem Published in April 1965
      
  \end{itemstep}
  
\end{slide}
}

\begin{slide}{What if...}
  \begin{itemize}
  \item Good reveals in 1993 \cite{ConversationGood} that Garwin
    visited him in 1957, but:
  \item ``Garwin told me with great enthusiasm about his experimental
    work... Unfortunately, not wanting to change the subject..., I
    didn't mention my FFT work at that dinner, although I had intended
    to do so. He wrote me on February 9, 1976 and said: ``Had we
    talked about it in 1957, I could have stolen [publicized] it from
    you instead of from John Tukey...''... That would have completely
    changed the history of the FFT''
  \end{itemize}
\end{slide}

\overlays{10}{
  \begin{wideslide}{Cooley-Tukey FFT}
    \onSlide*{1-7}{$A(j) = \sum_{k=0}^{N-1} X(k) \unityroot_{N}^{jk} \quad N=N_1 N_2$}
    \onSlide*{2-7}
    {
      \begin{itemize}
      \item Convert the 1-D indices into 2-D 
        \begin{align*}
          j&= j_0 + j_1 N_1 & j_0 &= 0,\dots,N_1-1 & j_1 &= 0,\dots,N_2-1
          \\
          k&= k_0 + k_1 N_2 & k_0 &= 0,\dots,N_2-1 & k_1 &= 0,\dots,N_1-1
        \end{align*}
        \onSlide*{3-7}{
        \item ``Decimate'' the original sequence into $N_2$ $N_1$-subsequences:
          \begin{align*}
            \onSlide*{4-7}{A(j_1,j_0) &= \sum_{k=0}^{N-1} X(k)
              \unityroot_N^{jk} &\\}
            \onSlide*{5-7}{
              &= \sum_{k_0=0}^{N_2-1}\sum_{k_1=0}^{N_1-1} X(k_1,k_0)
              \unityroot_{N_1 N_2}^{(j_0+j_1 N_1)(k_0+k_1 N_2)} &\\}
            \onSlide*{6-7}{
              &= \sum_{k_0=0}^{N_2-1} \unityroot_N^{j_0 k_0}
              \unityroot_{N_2}^{j_1 k_0} \sum_{k_1=0}^{N_1-1} X(k_1,k_0)
              \unityroot_{N_1}^{j_0 k_1} \onSlide*{7}{ &=
                \sum_{k_0=0}^{N_2-1} \unityroot_N^{j_0 k_0}
                 X_{j_0}(k_0) \unityroot_{N_2}^{j_1 k_0}}
            }
          \end{align*}
        }
      \end{itemize}
    }
    \onSlide*{8}
    {
      \begin{itemize}
      \item The sequences $X_{j_0}$ have $N$ elements and require $N N_1$
        operations to compute
      \item Given these, the array $A$ requires $N N_2$ operations to
        compute
      \item Overall $N(N_1+N_2)$
      \item In general, if $N=\prod_{i=0}^n N_i$ then $N
        \left(\sum_{i=0}^{n} N_i\right)$ operations overall.
      \end{itemize}
    }
    \onSlide*{9}
    {
      \small
      If $N_1=N_2=\dots=N_n=2$ (i.e. $N=2^n$):
        \begin{align*}
          j &= j_{n-1}2^{n-1}+\dots+j_0,\quad k=k_{n-1}2^{n-1}+\dots+k_0\\
          A(j_{n-1},\dots,j_0) &= \sum_{k_0=0}^1 \dots
          \sum_{k_{n-1}=0}^1 X(k_{n-1},\dots,k_0) \unityroot_N^{jk} \\
%        
          &= \sum_{k_0} \unityroot_N^{jk_0} \sum_{k_1}
          \unityroot_N^{jk_1\times 2} \dots \sum_{k_{n-1}}
          \unityroot_N^{jk_{n-1}\times 2^{n-1}} X(k_{n-1},\dots,k_0)\\
%          
          \Rightarrow X_{j_0}(k_{n-2},\dots,k_0)&= \sum_{k_{n-1}}
           X(k_{n-1},\dots,k_0) \unityroot_2^{j_0 k_{n-1}}\\
%          
          X_{j_1,j_0}(k_{n-3},\dots,k_0)&=\sum_{k_{n-2}}
          \unityroot_4^{j_0 k_{n-2}} X_{j_0}(k_{n-2},\dots,k_0)
          \unityroot_2^{j_1 k_{n-2}}\\
%
          X_{j_{m-1},\dots,j_0}(k_{n-m-1},\dots,k_0) &= \sum_{k_{n-m}}
          \unityroot_{2^m}^{(j_0+\dots+j_{m-2}\times
            2^{m-2})k_{n-m}} X_{j_{m-2},\dots,j_0}(k_{n-m},\dots,k_0) \times
          \\
          & \times \unityroot_2^{j_{m-1}k_{n-m}}
        \end{align*}
    }
    \onSlide*{10}
    {
      Finally, $A(j_{n-1},\dots,j_0)=X_{j_{n-1},\dots,j_0}$
      \begin{itemize}
      \item Two important properties:
        \begin{itemize}
        \item In-place: only two terms are involved in the calculation
          of every intermediate pair $X_{j_{m-1},\dots,j_0}(\dots)$ $j_{m-1}=0,1$:
          \begin{itemize}
          \item $X_{j_{m-2},\dots,j_0}(0,\dots)$
          \item $X_{j_{m-2},\dots,j_0}(1,\dots)$
          \end{itemize}
          So the values can be overwritten and no more additional
          storage be used
        \item Bit-reversal
          \begin{itemize}
          \item It is convenient to store
            $X_{j_{m-1},\dots,j_0}(k_{n-m-1},\dots,k_0)$ at index $j_0
            2^{n-1} + \dots +
            j_{m-1}2^{n-m}+k_{n-m-1}2^{m-n-1}+\dots+k_0$
          \item The result must be reversed

          \end{itemize}
          
        \end{itemize}
      \item Cooley \& Tukey didn't realize that the algorithm in fact
        builds large DFT's from smaller ones with some multiplications
        along the way.
      \item The terms $\unityroot_{2^m}^{(j_0+\dots+j_{m-2}\times
            2^{m-2})k_{n-m}}$ are later called\cite{gentleman:66} ``twiddle factors''.
      \end{itemize}
    }
  \end{wideslide}
}

\begin{slide}{FFT properties}
  \begin{itemize}
  \item Roundoff error significantly reduced compared to defining
    formula \cite{gentleman:66}
  \item A lower bound of $\frac{1}{2}n \log_2 n$ operations over
    \complexfield\/ for linear algorithms is proved
    \cite{Morgenstern73}, so FFT is in a sense optimal
  \item Two ``canonical'' FFTs
    \begin{itemize}
    \item Decimation-in-Time: the Cooley \& Tukey version. Equivalent
      to taking $N_1=N/2,\quad N_2=2$. Then
      $X_{j_0}(0)$ are the DFT coefficients of even-numbered samples
      and $X_{j_0}(1)$ - those of odd-numbered. The final coefficients
      are simply linear combination of the two.
    \item Decimation-in-Frequency (Tukey-Sande\cite{gentleman:66}):
      $N_1=2,\quad N_2=N/2$. Then $Y(*)=X_0(*)$ is the sum of
      even-numbered and odd-numbered samples, and $Z(*)=X_1(*)$
      is the difference. The even Fourier coefficients are the DFT of
      $Y$, and the odd-numbered - ``weighted'' DFT of $Z$.
    \end{itemize}
  \end{itemize}
\end{slide}

\overlays{3}{
\begin{slide}{DIT signal flow}
  \onSlide*{1}{\includegraphics{images/dit-stage1}}
  \onSlide*{2}{\includegraphics{images/dit-stage2}}
  \onSlide*{3}{\includegraphics{images/dit-stage3}}
\end{slide}
}

\overlays{3}{
\begin{slide}{DIF signal flow}
  \onSlide*{1}{\includegraphics{images/dif-stage1}}
  \onSlide*{2}{\includegraphics{images/dif-stage2}}
  \onSlide*{3}{\includegraphics{images/dif-stage3}}
\end{slide}
}

\tsectionandpart*{Newly discovered history of FFT}

\begin{slide}{What happened after the publication}
    \begin{itemize}
    \item Rudnick's note \cite{Rudnick:1966:NCF} mentions
      Danielson-Lanczos paper from 1942 \cite{DanielsonLa:42}. ``Although
      ... less elegant and is phrased wholly in terms of real
      quantities, it yields the same results as the binary form of the
      Cooley-Tukey algorithm with a comparable number of arithmetical operations''.
    \item ``Historical Notes on FFT'' \cite{Cooley:67}
      \begin{itemize}
      \item Also mentions Danielson-Lanczos
      \item Thomas and Prime Factor algorithm (Good) - quite
        distinct from Cooley-Tukey FFT
      \end{itemize}
    \item A later investigation\cite{hejb:85} revealed that Gauss
      essentially discovered the Cooley-Tukey FFT in 1805(!)
    \end{itemize}
\end{slide}

\overlays{8}{
\begin{wideslide}{Danielson-Lanczos}
  \onSlide*{1-5}{
    \begin{itemize}
      \item \bibentry{DanielsonLa:42}
      \onSlide*{2-5}{\item ``... the available standard forms become impractical for a
        large number of coefficients. We shall show that, by a certain
        transformation process, it is possible to double the number of
        ordinates with only slightly more than double the labor''}
      \onSlide*{3-5}{\item Concerned with improving efficiency of hand
        calculation}
      \onSlide*{4-5}{\item ``We shall now describe a method which eliminates the
        necessity of complicated schemes by reducing ... analysis for $4n$
        coefficients to two analyses for $2n$ coefficients''}
      \onSlide*{5}{
      \item ``If desired, this reduction process can be applied twice or
        three times'' (???!!)
      \item ``Adoping these improvements the approximate times...  are:
        10 minutes for 8 coefficients, 25 min. for 16, 60 min. for 32
        and 140 min. for 64'' ($\approx 0.37 N \log_2 N$)
      }
    \end{itemize}
  }
  \onSlide*{6-8}
  {
    \begin{itemize}
    \item Take the DIT Cooley-Tukey with $N_1=N/2,\quad N_2=2$. Recall
      \begin{align*}
        A(j_0+\frac{N}{2} j_1) &= \sum_{k_1=0}^{N/2-1}X(2 k_1)
        \unityroot_{N/2}^{j_0 k_1} + \unityroot_{N}^{j_0 + \frac{N}{2}
          j_1} \sum_{k_1=0}^{N/2-1} X(2k_1+1) \unityroot_{N/2}^{j_0 k_1}
      \end{align*}

      \onSlide*{7-8}{
      \item Danielson-Lanczos showed \emph{completely analogous} result, under the
        framework of real trigonometric series:
        \begin{itemize}
        \item ``...the contribution of the even ordinates to a Fourier
          analysis of $4n$ coefficients is exactly the same as the
          contribution of all the ordinates to a Fourier analysis of $2n$
          coefficients
        \item ``..contribution of the odd ordinates is also reducible to
          the same scheme... by means of a transformation process
          introducing a phase difference''
        \item ``...we see that, apart from the weight factors $2 \cos
          (\pi/{4n})k$ and $2 \sin(\pi/{4n})k$, the calculation is
          identical to the cosine analysis of half the number of ordinates''
          
          \onSlide*{8}
          {
          \item These ``weight factors'' are exactly the ``twiddle factors''
            $\unityroot_N^{j}$ above
          }
        \end{itemize}
      }
    \end{itemize}
  }
\end{wideslide}
}

\begin{slide}{Gauss}
  \begin{itemize}
  \item Herman H. Goldstine writes in a footnote of ``A History of
    Numerical Analysis from the 16th through  the 19th Century''
    (1977) \cite{Goldstine:1977:HNA}: 
    \begin{quote}
      ``This fascinating work of Gauss was neglected and rediscovered by
      Cooley and Tukey in an important paper in 1965''
    \end{quote}
  \item This goes largely unnoticed until the research by Heideman,
    Johnson and Burrus \cite{hejb:85} in 1985
    \begin{itemize}
    \item Essentially develops an DIF FFT for two factors and real
      sequences. Gives examples for $N=12,36$.
    \item Declares that it can be generalized to more than 2 factors,
      although no examples are given
    \item Uses the algorithm for solving the problem of determining
      orbit parameters of asteroids
    \item The treatise was published posthumously (1866). By then
      other numerical methods were preferred to DFT, so nobody found
      this interesting enough...
    \end{itemize}
  \end{itemize}
  
\end{slide}

\tsectionandpart*{Impact}

\begin{slide}{Impact}
  \begin{itemize}
  \item Certainly a ``classic''
  \item Two special issues of IEEE trans. on Audio and
    Electroacoustics devoted entirely to FFT\cite{IEEE69}
  \item Arden House Workshop on FFT \cite{IEEE69}
    \begin{itemize}
    \item Brought together people of very diverse specialities
    \item ``someday radio tuners will operate with digital processing
      units. I have heard this suggested with tongue in cheek, but one
      can speculate.'' - J.W.Cooley
    \end{itemize}
  \item Many people ``discovered'' the DFT via the FFT
  \end{itemize}
\end{slide}


\begin{slide}{Further developments}
 \begin{itemize}
 \item Real-FFT (DCT...)
 \item Parallel FFT
 \item FFT for prime $N$
 \item ...
 \end{itemize}
\end{slide}

\overlays{7}{
\begin{slide}{Concluding thoughts}
  \begin{itemstep}
  \xitem It is obvious that prompt recognition and publication of
  significant achievments is an important goal
  \xitem However, the publication in itself may not be enough
  \xitem Communication between mathematicians, numerical analysts and
  workers in a very wide range of applications can be very fruitful
  \xitem Always seek new analytical methods and not rely solely on increase in
    processing speed
  \xitem Careful attention to a review of old literature may offer
  some rewards
  \xitem Do not publish papers in Journal of Franklin Institute
  \xitem Do not publish papers in neo-classic Latin
  \end{itemstep}
\end{slide}
}

%\tsection*{Bibliography}

%\begin{wideslide}{References}
  \bibliography{fft-lecture-bib}
%\end{wideslide}

\end{document}