\documentclass{article}
\usepackage[paperwidth=297mm, paperheight=210mm, left=1mm, top=0mm, bottom=1mm, right=1mm]{geometry}
\usepackage[T1]{fontenc}
\usepackage{mathptmx}
\DeclareMathAlphabet{\mathcal}{OMS}{cmsy}{m}{n}
\usepackage{microtype}
\usepackage{amsmath,amssymb}
\usepackage{multicol}
\usepackage{titlesec}
\usepackage{enumitem}
\usepackage{stmaryrd}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage[printwatermark]{xwatermark}
%\newwatermark[allpages, color=red!100, scale=8]{$\mathcal{:)}$}
\titleformat{\section}
    {\color{red}\normalfont\scriptsize\bfseries}
    {\thesection}
    {1em}
    {}

\titleformat{\subsection}
  {\color{blue}\normalfont\scriptsize\bfseries}
  {\thesubsection}
  {1em}
  {}
\titlespacing{\section}{0pt}{0ex}{0ex}
\titlespacing{\subsection}{0pt}{0ex}{0ex}
\titlespacing{\subsubsection}{0pt}{0ex}{0ex}

\setlength{\parindent}{0pt}
\setlength{\parskip}{0cm}
\pagestyle{empty}
\setlength{\columnsep}{5pt}
\setlength{\columnseprule}{0.4pt}
\renewcommand{\baselinestretch}{0.85}
\small
\setlist[itemize]{itemsep=0pt, topsep=2pt, partopsep=0pt}
\setlist[enumerate]{itemsep=0pt, topsep=2pt, partopsep=0pt}

\setlist[itemize]{leftmargin=*, labelsep=2pt, itemsep=0pt, topsep=1pt, parsep=0pt, partopsep=0pt}
\setlist[enumerate]{leftmargin=*, labelsep=2pt, itemsep=0pt, topsep=1pt, parsep=0pt, partopsep=0pt}
% Empty canvas with four equally spaced vertical columns
\begin{document}
\fontsize{6pt}{7pt}\selectfont
% 2. THEN overwrite the math spacing defaults for this size
\setlength{\abovedisplayskip}{2pt}
\setlength{\belowdisplayskip}{2pt}
\setlength{\abovedisplayshortskip}{0pt}
\setlength{\belowdisplayshortskip}{0pt}
  
\begin{multicols*}{5}

%% ============================================
%% CATEGORY 1: PAC LEARNING & SAMPLE COMPLEXITY
%% ============================================
\section*{\underline{PAC Learning \& Sample Complexity}}
$\mathbf{\hat{h}}$: Selected predictor (minimizes empirical risk), $\mathbf{h_{\mathcal{H}}}$: Best possible predictor in $\mathbf{\mathcal{H}}$ (minimizes true risk), $\mathbf{R(\hat{h})}$: True Risk (actual error) of the selected predictor $\mathbf{\hat{h}}$, $\mathbf{R_{\mathcal{V}^m}(\hat{h})}$: Empirical Risk (measured error) on validation set $\mathbf{\mathcal{V}^m}$, $\mathbf{\mathcal{H}}$: Hypothesis Space 
\subsection*{1.1 Type A: Single Model Test (Hoeffding)}
\textbf{Recognize:} "Size of test set", "Fixed prediction strategy", "Confidence interval", "Estimate $R(h)$". \\
\textbf{Key:} \textbf{One} model only ($|\mathcal{H}|$ is irrelevant). Watch the \textbf{Range} $R$.

\textbf{Example: Sequence Prediction (Hamming Loss)} \\
\textbf{Given:} Predict seq. length $n=10$. Loss = Hamming distance (sum of errors). \\
\textbf{Range $R$:} Loss $\in [0, n]$, so $R=n=10$. (If "capped at 5", then $R=5$).

\textbf{a) Find Confidence $\delta$ (Failure Prob)} \\
\textbf{Params:} $l=250$ (samples), $\epsilon=1$ (error margin). \\
\textbf{Formula:} $\delta \le 2e^{-2l\epsilon^2/R^2}$ \\
\textbf{Calc:} $\delta \le 2 \exp\left(\frac{-2 \cdot 250 \cdot 1^2}{10^2}\right) = 2e^{-5} \approx \mathbf{0.0135}$

\textbf{b) Find Sample Size $l$ (Test Set)} \\
\textbf{Params:} $\epsilon=1, \delta=0.05$, \textbf{Formula:} $l \ge \frac{R^2}{2\epsilon^2} \ln\left(\frac{2}{\delta}\right)$ \\
\textbf{Calc:} $l \ge \frac{100}{2} \ln(40) = 50 \cdot 3.69 \approx \mathbf{185}$

\textbf{c) Find Error Margin $\epsilon$} \\
\textbf{Params:} $l=20,000, \delta=0.05$. (Assume Binary Loss $R=1$). \\
\textbf{Formula:} $\epsilon = R \sqrt{\frac{\ln(2/\delta)}{2l}}$, \textbf{Calc:} $\epsilon = 1 \cdot \sqrt{\frac{3.69}{40,000}} \approx \mathbf{0.0096} \ (0.96\%)$

\subsection*{1.2 Type B: Uniform Guarantee (ULLN)}
\textbf{Recognize:} "For \textbf{every} $h \in \mathcal{H}$", "Size of validation set" (checking all), "Uniformly". \\
\textbf{Key:} $|\mathcal{H}|$ matters. We ensure the entire list is accurate.

\textbf{Example: CNN Validation (100 Epochs)} \\
\textbf{Given:} $|\mathcal{H}| = 100$ classifiers. $\delta=0.05$. Range $R=1$.

\textbf{a) Sample Size $m$ (Uniform Interval)} \\
\textbf{Goal:} Ensure $|R_{val} - R_{true}| \le \epsilon$ for \textbf{all} $h$. \\
\textbf{Params:} $\epsilon=0.01$. \\
\textbf{Formula:} $m \ge \frac{R^2}{2\epsilon^2} \ln\left(\frac{2|\mathcal{H}|}{\delta}\right)$ \\
\textbf{Calc:} $m \ge \frac{1}{2(10^{-4})} \ln\left(\frac{200}{0.05}\right) = 5000 \cdot \ln(4000) \approx \mathbf{41,470}$

\subsection*{1.3 Type C: Model Selection (PAC / Excess Risk)}
\textbf{Recognize:} "Selected vs Best", "Estimation Error", "$R(\hat{h}) \le R(h^*) + \epsilon$", "Size of validation set". \\
\textbf{Key:} Requires tighter precision ($\epsilon/2$) or deals with Parameter Learning.

\textbf{Example 1: CNN Selection (Validation)} \\
\textbf{Goal:} Selected $\hat{h}$ is at most $1\%$ worse than best $h^*$. \\
\textbf{Params:} $|\mathcal{H}|=100, \epsilon=0.01, \delta=0.05$. \\
\textbf{Formula:} $m \ge \frac{2R^2}{\epsilon^2} \ln\left(\frac{2|\mathcal{H}|}{\delta}\right)$ \quad (Derived from Type B with $\epsilon/2$) \\
\textbf{Calc:} $m \ge \frac{2}{(0.01)^2} \ln(4000) = 20,000 \cdot 8.29 \approx \mathbf{165,880}$

\textbf{Example 2: Fixed Tree (Parameter Learning)} \\
\textbf{Given:} Input space partitioned into $M=128$ regions. $h(x) = \sum c_r \llbracket x \in R_r \rrbracket$. \\
\textbf{Goal:} Learn Params $c \in \{-1, +1\}^M$. Estimation error $\le \epsilon$.

\textbf{a) ERM Algorithm \& Formula} \\
\textbf{Algo:} Majority vote in each region. \\
\textbf{Formula:} $c_r = \text{sign}\left( \sum_{i: x_i \in R_r} y_i \right)$

\textbf{b) Sample Size $m$ (Training)} \\
\textbf{Params:} $\epsilon=0.05, \delta=0.1$. \\
\textbf{Hypothesis Size:} $|\mathcal{H}| = 2^M = 2^{128}$. \\
\textbf{Formula:} $m \ge \frac{2}{\epsilon^2} \ln\left(\frac{2|\mathcal{H}|}{\delta}\right)$ \quad (Std. Realizable/Agnostic Bound) \\
\textbf{Calc:} $\frac{2}{0.0025} (\ln 20 + 128 \ln 2) \approx 800 (3 + 88.7) \approx \mathbf{73,360}$

\textbf{c) PAC Learnable?} \\
\textbf{YES.} Finite hypothesis class ($2^M$). ERM on finite $\mathcal{H}$ is always PAC.

\textbf{d) Approx. Error = 0? (Multivariate Normal)} \\
\textbf{NO.} Normal dists have quadratic boundaries. Tree regions are axis-aligned boxes. A box cannot perfectly fit a curve.
%% ============================================
%% CATEGORY 2: PARAMETER ESTIMATION
%% ============================================
\section*{\underline{Parameter Estimation}}

\subsection*{2.1 Radial Exp. Distribution}
\textbf{Given:} $p(x) = \frac{1}{2\pi} e^{-\|x-\mu\|}$ in $\mathbb{R}^2$.

\textbf{a) Maximum Likelihood Estimator} \\
The MLE is $\mu_{MLE} = \arg \max_{\mu} \prod_{i=1}^m \frac{1}{2\pi} e^{-\|x_i - \mu\|}$. \\
Taking the log: $\sum_{i=1}^m \left( \ln(\frac{1}{2\pi}) - \|x_i - \mu\| \right) \rightarrow \max_{\mu}$. 
\\Ignoring constants: $\sum_{i=1}^m \|x_i - \mu\| \rightarrow \min_{\mu}$. \\
\textbf{Gradient:} $\nabla_\mu \ell = \sum_{i=1}^m \frac{x_i - \mu}{\|x_i - \mu\|}$. \\ Setting to zero yields the \textbf{Geometric Median} (spatial median).

\textbf{Does not have a closed form solution}

\textbf{b) Bayes Predictor \& VC Dim} \\
The Bayes predictor is $h(x) = \arg \max_{k} e^{-\|x - \mu_k\|} = \arg \min_{k} \|x - \mu_k\|$. \\
\textbf{Boundary:} Perpendicular bisector of $\mu_1, \mu_2$ (linear). \\
\textbf{VC Dim:} Linear classifiers in $\mathbb{R}^d$ have $\text{VC} = d + 1 = 3$.

\subsection*{2.2 Bernoulli Distribution (MLE \& Bias)}
\textbf{Given:} Dataset $\mathcal{T}_m = \{x_1, \dots, x_m\}$ where $x_i \in \{0,1\}$. \\
\textbf{Model:} $p(x) = \beta^x (1 - \beta)^{1-x}$ (Bernoulli).

\textbf{a) MLE Estimation}, 
\textbf{Likelihood:} $L(\beta) = \prod_{i=1}^m \beta^{x_i} (1-\beta)^{1-x_i}$ \\
\textbf{Log-Likelihood:} $\ell(\beta) = \sum_{i=1}^m \left( x_i \ln \beta + (1-x_i) \ln(1-\beta) \right)$ \\
\textbf{Derivative:} $\frac{\partial \ell}{\partial \beta} = \frac{\sum x_i}{\beta} - \frac{m - \sum x_i}{1-\beta}$ \\
\textbf{Optimal $\beta$:} Set $\frac{\partial \ell}{\partial \beta} = 0 \Rightarrow \hat{\beta}_{MLE} = \frac{1}{m} \sum_{i=1}^m x_i$ (Sample Mean).

\textbf{b) Bias Check},
\textbf{Task:} Determine if unbiased, i.e., check if $\mathbb{E}[\hat{\beta}] = \beta$. \\
\textbf{Proof:} $\mathbb{E}[\hat{\beta}] = \mathbb{E}\left[ \frac{1}{m} \sum x_i \right] = \frac{1}{m} \sum \mathbb{E}[x_i]$. \\
Since $x_i \sim \text{Bernoulli}(\beta)$, we know $\mathbb{E}[x_i] = \beta$. 
$\Rightarrow \mathbb{E}[\hat{\beta}] = \frac{1}{m} (m \cdot \beta) = \beta$. \\
\textbf{Result:} The estimator is \textbf{Unbiased}.\\
\textbf{c) Exponential Family Form} \\
\textbf{Rewrite:} $p(x) = \exp( x \ln \frac{\beta}{1-\beta} + \ln(1-\beta) )$. \\
\textbf{Match:} $\exp(\theta \phi(x) - A(\theta))$. \\
\textbf{Params:} $\phi(x)=x, \ h(x)=1, \ \theta = \ln \frac{\beta}{1-\beta}, \ A(\theta) = \ln(1+e^\theta)$.
\subsection*{2.3 Gaussian KL Approx}
\textbf{Task:} Find $\mu, \sigma$ for $q(x)$ to approx fixed $p(x)$ (mean $\mu_0$, var $\sigma_0^2$) by min $D_{KL}(p||q)$.

\textbf{Logic:} Minimizing $D_{KL}(p||q)$ is equivalent to maximizing $\mathbb{E}_{p}[\ln q(x)]$. For Gaussian $q$, this requires moment matching.

\textbf{Result:} $\mathbf{\mu = \mu_0} \quad \mathbf{\sigma = \sigma_0}$

\subsection*{2.4 1-NN Regression: Bias-Variance Decomposition}
\textbf{Given:} Regression problem $y = f(x) + \epsilon$, noise $\epsilon \sim (0, \sigma^2)$. \\
\textbf{Model:} 1-Nearest Neighbor $h_m(x) = y_{n(x)} = f(x_{n(x)}) + \epsilon_{n(x)}$, where $n(x)$ is the index of the nearest neighbor. \\
\textbf{Assumption:} Fixed design (inputs $x_i$ are fixed/deterministic, randomness is only in $\epsilon$).

\textbf{a) Expected Predictor $g_m(x)$} \\
\textbf{Def:} $g_m(x) = E_{\mathcal{T}^m}[h_m(x)]$ \\
\textbf{Calc:} $E_\epsilon[f(x_{n(x)}) + \epsilon_{n(x)}] = f(x_{n(x)}) + E[\epsilon] = f(x_{n(x)})$. \\
\textit{Note: $x_{n(x)}$ is fixed, so $f(x_{n(x)})$ is constant.}

\textbf{b) Squared Bias ($E_x[(g_m(x) - f(x))^2]$)} \\
\textbf{Result:} $E_x[(f(x_{n(x)}) - f(x))^2]$. \\
\textit{Interpretation:} Bias depends entirely on how far the nearest neighbor is from the query point $x$. If $x_{n(x)} \approx x$ (dense data), bias is small.

\textbf{c) Variance ($Var_{\mathcal{T}^m}(h_m(x))$)} \\
\textbf{Def:} $E[(h_m(x) - g_m(x))^2]$ or $Var(f(x_{n(x)}) + \epsilon_{n(x)})$ \\
\textbf{Calc:} $Var(\text{const} + \epsilon) = Var(\epsilon) = \sigma^2$. \\
\textbf{Result:} $\mathbf{\sigma^2}$. \\
\textit{Insight:} 1-NN does not average data (unlike k-NN), so it does not reduce noise variance. The error variance equals the irreducible noise.


\subsection*{2.5 Sequence Boundary Detection}
\textbf{Given:} Sequence $x_1, \dots, x_n$ with changepoint $k$. \\
Model: $x_{1:k} \sim \mathcal{N}(\mu_1, \sigma_1^2)$, $x_{k+1:n} \sim \mathcal{N}(\mu_2, \sigma_2^2)$, prior $p(k) = \pi_k$.

\textbf{Task 1: MLE with observed boundaries $(x^j, k_j)$.}

$\pi_k = \frac{\text{count}(k_j = k)}{m}$, \quad $\mu_1 = \text{avg}(x_i^j \text{ where } i \le k_j)$, \quad $\mu_2 = \text{avg}(x_i^j \text{ where } i > k_j)$

$\sigma_r^2 = \frac{1}{N_r} \sum (x_i^j - \mu_r)^2$ for $r=1$ ($i \le k_j$) and $r=2$ ($i > k_j$)

\textbf{Task 2: Optimal $k^*$ under quadratic loss $\ell = (k - k')^2$.}

Bayes optimal = posterior mean: $k^* = \mathbb{E}[k|x] = \sum_{k=0}^n k \cdot p(k|x)$

where $p(k|x) \propto \pi_k \prod_{i=1}^k \mathcal{N}(x_i|\mu_1, \sigma_1^2) \prod_{i=k+1}^n \mathcal{N}(x_i|\mu_2, \sigma_2^2)$

\subsection*{2.6 Generative Gaussian vs Linear Classifier}
\textbf{Given:} Class-conditional $p(x|y) = \mathcal{N}(\mu_y, C_y)$ (Multivariate Normal), hypothesis $\mathcal{H}$: linear classifiers, algo: ERM.

\textbf{a) Approx. error if $C_+ = C_-$?} (LDA) \\
Log-ratio $\ln \frac{p(x|+)}{p(x|-)}$ cancels quadratic terms → boundary is linear → $h^* \in \mathcal{H}$. \\
\textbf{Result:} $\epsilon_{app} = 0$ ($\epsilon_{app}$ is the err of best possible classif. - bayesian)

\textbf{b) Approx. error if $C_+ \neq C_-$?} (QDA) \\
Quadratic terms don't cancel → boundary is quadratic → $h^* \notin \mathcal{H}$. \\
\textbf{Result:} $\epsilon_{app} > 0$

\textbf{c) Statistically consistent?} \textbf{Result:} No (model misspec)
ERM converges to best linear $h_\mathcal{H}$, but $R(h_\mathcal{H}) > R(h^*)$ when $C_+ \neq C_-$. \\
\textbf{d) Pac Learnable?}
Is hypothesis space finite -> \textbf{YES} \\
Is VC dim finite -> \textbf{YES} (hypothesis space CAN then be infinite) 

\subsection*{2.7 Poisson Exp. Family \& MLE}
\textbf{Task:} Show Poisson $p(k) = \frac{\lambda^k e^{-\lambda}}{k!}$ is Exp. Family. Find MLE for natural param $\eta$ given avg $\bar{k}$.

\textbf{a) Exp. Family Form} (Rewrite via $\lambda^k = e^{k \ln \lambda}$): \\
$p(k) = \frac{1}{k!} \exp[k \log \lambda - \lambda] = h(k) \exp[k\eta - A(\eta)]$

Natural param: $\eta = \log \lambda$ \quad Log-partition: $A(\eta) = e^\eta$ \quad Sufficient Statistic: $T(k)=k$ \quad Base: $h(k) = \frac{1}{k!}$

\textbf{b) MLE Derivation} (Maximize log-likelihood):\\
$\mathcal{L}(\eta)=\sum_{i=1}^{m}log\ p(k_i|\eta),\ log\ p(k_i|\eta)=log(h(k))+\eta k - A(\eta) \xrightarrow[]{\text{wrt}\ \eta}\sum_{i=1}^m (\eta k_i-A(\eta))$
Objective: $\eta \bar{k} - A(\eta) \rightarrow \max_{\eta}$ \\
Differentiate: $\frac{\partial}{\partial \eta} (\eta \bar{k} - e^\eta) = \bar{k} - e^\eta = 0$, \textbf{Solution:} $\eta = \log \bar{k}$.
\textbf{b2) MLE Derivation} (Maximize likelihood):\\
$L(\lambda) = \prod_{j=1}^m \frac{\lambda^{x_j}e^{-\lambda}}{x_j!} \implies \mathcal{L}(\lambda) = \ln L(\lambda) = \sum_{j=1}^m (x_j \ln \lambda - \lambda - \ln x_j!).$ \\
Differentiate: $\frac{\partial \mathcal{L}}{\partial \lambda} = \sum_{j=1}^m \left(\frac{x_j}{\lambda} - 1\right) = \frac{1}{\lambda}\sum_{j=1}^m x_j - m = 0$. \\
\textbf{Solution:} $\hat{\lambda} = \frac{1}{m}\sum_{j=1}^m x_j$.

\textbf{c) Compute Fisher Information}: (assuming mean equals variance)\\
1. log-likelihood of one element: $\ln p(x; \lambda) = x \ln \lambda - \lambda - \ln(x!)$ \\
2. first derivation: $\frac{\partial}{\partial \lambda} \ln p(x; \lambda) = \frac{x}{\lambda} - 1$ \\
3. second derivation: $\frac{\partial^2}{\partial \lambda^2} \ln p(x; \lambda) = -\frac{x}{\lambda^2}$ \\
4. fisher information: $I(\lambda) = -\mathbb{E}\left[ \frac{\partial^2}{\partial \lambda^2} \ln p(X; \lambda) \right] = \mathbf{\frac{1}{\lambda}}$

\textbf{d) Compute Minimum Sample Count} so that $\epsilon < 0.1$, $\delta > 0.99$: \\
1. Asymptotic Normality: $\hat{\lambda}_{MLE} \sim \mathcal{N}\left(\lambda, \frac{1}{m I(\lambda)}\right) = \mathcal{N}\left(\lambda, \frac{\lambda}{m}\right)$ \\
2. Confidence Interval: $P(|\hat{\lambda} - \lambda| \leq \varepsilon) \geq 0.99 \implies \frac{\varepsilon}{\sqrt{\lambda/m}} \geq z_{0.995} \approx 2.576$ \\
3. Solve for m: $\sqrt{m} \geq \frac{2.576 \sqrt{\lambda}}{\varepsilon} \implies m \geq \lambda \left( \frac{2.576}{\varepsilon} \right)^2$ \\
4. Worst Case Assumption: $\lambda < 1 \implies \max(\lambda) \to 1$ \\
5. Calculation: $m \geq 1 \cdot \left( \frac{2.576}{0.1} \right)^2 \approx 663.57 \implies m = 664$

%% ============================================
%% CATEGORY 3: NEURAL NETWORKS
%% ============================================
\section*{\underline{Neural Networks}}

\subsection*{3.1 Backpropagation}
\textbf{Given:} $h = \tanh(s)$, $a_j = \text{relu}(z_j)$, Loss $\ell = \frac{1}{2}(y-h)^2$. \\
\textbf{Hints:} $\tanh' = 1-\tanh^2(s)$, $\text{relu}(s) = \max(0, s)$.

\textbf{Derivation:}
\begin{enumerate}
    \item \textbf{Output Error Signal:} $\delta_{out} = \frac{\partial \ell}{\partial h} \cdot \frac{\partial h}{\partial s} = -(y-h)(1-\tanh^2(s))$
    \item \textbf{Layer 2 ($w_j^{(2)}$):} Input is $a_j$. \\
    $\frac{\partial \ell}{\partial w_j^{(2)}} = \delta_{out} \cdot a_j = \mathbf{-(y-h)(1-\tanh^2(s)) \cdot \text{relu}(z_j)}$
    \item \textbf{Layer 1 ($w_{ij}^{(1)}$):} Backprop error through weights $w_j^{(2)}$ and activation. \\
    (Note: Deriv of $\text{relu}(z_j)$ is 1 if $z_j > 0$, else 0). \\
    $\frac{\partial \ell}{\partial w_{ij}^{(1)}} = \mathbf{-(y-h)(1-\tanh^2(s)) \cdot w_j^{(2)} \cdot \begin{cases} 1 & \text{if } z_j > 0 \\ 0 & \text{else} \end{cases} \cdot x_i}$
\end{enumerate}

\subsection*{3.2 Median Pooling (2x2 Backward)}
\textbf{Given:} Window size $2\times2=4$ (even). Median is avg of two middle sorted vals $x_{(2)}, x_{(3)}$.
$f_{kl} = 0.5(x_{(2)} + x_{(3)})$

\textbf{Gradient:}
Splits between the middle elements.
$\frac{\partial f}{\partial x_{ij}} = \begin{cases} 0.5 & \text{if } x_{ij} \in \{x_{(2)}, x_{(3)}\} \\ 0 & \text{else} \end{cases}$

\subsection*{3.3 Neural Module (Linear + ELU)}
\textbf{Task:} Define module with $n$ inputs, $K$ linear+ELU units. \\
ELU: $f(z) = z$ if $z>0$, else $e^z - 1$. Deriv: $f'(z) = 1$ if $z>0$, else $e^z$.

\textbf{Forward Message} (layer outputs as function of inputs): \\
$z_k = \sum_{j=1}^n w_{kj} x_j + b_k$, \quad $y_k = f(z_k)$ for $k = 1, \dots, K$.

\textbf{Backward Message} (derivs of all outputs w.r.t. all inputs): \\
$\frac{\partial y_k}{\partial x_j} = \frac{\partial y_k}{\partial z_k} \cdot \frac{\partial z_k}{\partial x_j} = f'(z_k) \cdot w_{kj}$ for all $k, j$.

\textbf{Parameter Message} (derivs of all outputs w.r.t. all params): \\
$\frac{\partial y_k}{\partial w_{kj}} = f'(z_k) \cdot x_j$, \quad $\frac{\partial y_k}{\partial b_k} = f'(z_k)$ for all $k, j$.

\subsection*{3.4 Backprop (Sigmoid + BCE)}
\textbf{Given:} Input $x \in \mathbb{R}^n$, target $y \in \{0,1\}$, weights $w \in \mathbb{R}^n$. \\
Forward: $s = \sum w_i x_i$, \quad $\hat{y} = \sigma(s) = \frac{1}{1+e^{-s}}$ (network output). \\
Loss (BCE): $l = -y \log\hat{y} - (1-y)\log(1-\hat{y})$.

\textbf{Task 1: Derive gradient $\nabla l(w)$ via backprop messages.}

$\frac{\partial l}{\partial \hat{y}} = -\frac{y}{\hat{y}} + \frac{1-y}{1-\hat{y}} = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})}$, \quad
$\frac{\partial \sigma}{\partial s} = \frac{e^{-s}}{(1+e^{-s})^2}$, \quad
$\frac{\partial s}{\partial w_i} = x_i$

Chain rule: $\frac{\partial l}{\partial w_i} = \frac{\partial l}{\partial \hat{y}} \cdot \frac{\partial \sigma}{\partial s} \cdot x_i$

\textbf{Task 2: Simplify using forward pass activity.}

$\frac{d\sigma}{ds} = \frac{1+e^{-s}}{(1+e^{-s})^2} - \frac{1}{(1+e^{-s})^2} = \sigma(s) - \sigma(s)^2 = \hat{y}(1-\hat{y})$

Substitute: $\frac{\partial l}{\partial \hat{y}} \cdot \frac{\partial \sigma}{\partial s}  = \frac{\partial l}{\partial \hat{y}} \cdot \hat{y}(1-\hat{y}) = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})} \cdot \hat{y}(1-\hat{y}) = \hat{y} - y$

\textbf{Final:} $\boxed{\frac{\partial l}{\partial w_i} = (\hat{y} - y) x_i}$


\subsection*{3.5 Prior Probability Shift (Label Shift)}
\textbf{Given:} A trained classifier outputting posteriors $p(k|x)$ based on training priors $p(k)$. The class-conditional distribution $p(x|k)$ remains constant, but the application domain has different priors $p_a(k)$.

\textbf{Task 1: Derive adjusted prediction rule for new domain.}

Using Bayes' Theorem, express the new posterior $p_a(k|x)$ by isolating the invariant appearance model $p(x|k)$:
\begin{enumerate}
    \item From training: $p(x|k) = \frac{p(k|x)p(x)}{p(k)}$
    \item In application: $p_a(k|x) = \frac{p(x|k)p_a(k)}{p_a(x)}$
\end{enumerate}
Substitute (1) into (2) and ignore constants $p(x), p_a(x)$ to find proportionality:
$$ p_a(k|x) \propto p(k|x) \cdot \frac{p_a(k)}{p(k)} $$

\textbf{Task 2: Calculate normalized adjusted probabilities.}

To ensure $\sum_k p_a(k|x) = 1$, use the following adjustment formula for each class $k$:
$$ p_a(k|x) = \frac{p(k|x) \frac{p_a(k)}{p(k)}}{\sum_{j=1}^K p(j|x) \frac{p_a(j)}{p(j)}} $$

\vspace{-0.4cm}
\textbf{Key Intuition:}
\begin{itemize}
    \item $\frac{p_a(k)}{p(k)} > 1$: Class $k$ is more frequent now $\rightarrow$ \textbf{Boost} probability.
    \item $\frac{p_a(k)}{p(k)} < 1$: Class $k$ is rarer now $\rightarrow$ \textbf{Suppress} probability.
\end{itemize}
\subsection*{3.6: Conv Layer Params \& Dimensions}

\textbf{(1) Parameter Types \& Count:}
Filters have volume $F \times F \times C$. There are $D$ filters. Filter uses stride $S$.
\newline
Weights: $D \times (F \times F \times C)$, Biases: $D \times 1$.
\newline
Total: $D(F^2 C) + D = \boxed{D(F^2 C + 1)}$

\textbf{(2) Padding $P$ for preserved width ($W_{in} = W_{out}$):}
\newline
Dim Formula: $W_{out} = \frac{W_{in} + 2P - F}{S} + 1$. Substitute $W_{out} \to W_{in}$:
\newline
$W_{in} - 1 = \frac{W_{in} + 2P - F}{S} \implies S(W_{in} - 1) = W_{in} + 2P - F$
\newline
\textbf{Final:} $\boxed{P = \frac{S(W_{in} - 1) - W_{in} + F}{2}}$
%% ============================================
%% CATEGORY 4: ALGORITHMS
%% ============================================
\section*{\underline{Algorithms}}

\subsection*{4.1 Regression Tree Split}
\textbf{Task:} Build regression tree, compute (greedy) first split iteration. \\
\textbf{Rule:} Split on one var to minimize $SSE = \sum(y-\bar{y})^2$. \\
\textbf{Data:} A$(x_1,x_2,y)$: A$(4,7,4)$, B$(2,7,8)$, C$(10,1,3)$.

\textbf{Split $x_1 \le 3$:} Left $\{B\}$: SSE=0. Right $\{A,C\}$: mean 3.5, SSE=0.5. \textbf{Total=0.5} \\
\textbf{Split $x_2 \le 4$:} Left $\{C\}$: SSE=0. Right $\{A,B\}$: mean 6, SSE=8. Total=8. \\
\textbf{Result:} Split on $x_1$ at 3 (lowest SSE).
\subsection*{4.2 Gradient Boosting (GBM)}
\textbf{Given:} Data $\{(x_i, y_i)\}$, loss $L(y, F)$, iterations $M$, learning rate $\nu$.

\textbf{Task 1: Generic GBM Algorithm.}

\textbf{Init:} $F_0(x) = \arg\min_c \sum_i L(y_i, c)$ ($c$ is best constant val). \\
\textbf{For} $m=1$ to $M$: \\
\quad (1) Pseudo-residuals: $r_{im} = -\frac{\partial L(y_i, F(x_i))}{\partial F(x_i)}\big|_{F=F_{m-1}}$ \\
\quad (2) Fit weak learner $h_m$ on $\{(x_i, r_{im})\}$ \\
\quad (3) Line search: $\gamma_m = \arg\min_\gamma \sum_i L(y_i, F_{m-1}(x_i) + \gamma h_m(x_i))$ \\
\quad (4) Update: $F_m(x) = F_{m-1}(x) + \nu \gamma_m h_m(x)$

\textbf{Task 2: Derive pseudo-residuals ($r = -\partial L / \partial F$).}

\textbf{Squared Log Loss:} $L(y, F) = \frac{1}{2}(\log(1+y) - \log(1+F))^2$

$\frac{\partial L}{\partial F} = -\frac{\log(1+y) - \log(1+F)}{1+F}$ \quad $\Rightarrow$ \quad $r = \frac{\log(1+y) - \log(1+F)}{1+F}$

\textbf{Quantile (Pinball) Loss:} $L_\tau = \tau(y-F)$ if $y \ge F$, else $(\tau-1)(y-F)$

$r = -\frac{\partial L}{\partial F} = \begin{cases} \tau & y \ge F \text{ (underest.)} \\ \tau-1 & y < F \text{ (overest.)} \end{cases}$

\textbf{Log-Cosh Loss:} $L(y, F(x)) = \log(\cosh(F(x) - y))$

$\frac{\partial L}{\partial F(x)} = \frac{\sinh(F(x) - y)}{\cosh(F(x) - y)} = \tanh(F(x) - y)$ \quad $\Rightarrow$ \quad $r = -\tanh(F(x) - y)$
\subsection*{4.3 Constrained Viterbi ($s_k = $ 'q')}
\textbf{Given:} Markov chain with $p(s) = p(s_1) \prod_{i=2}^n p(s_i|s_{i-1})$, alphabet $\mathcal{A}$. \\
\textbf{Task:} $\arg\max_s p(s)$ s.t. $s_k = $ 'q'. \\
\textbf{Algorithm:} (1) Forward: $\delta_i(x) = \max_{x'} [\delta_{i-1}(x') p(x|x') p(x)]$. (2) At step $k$: set $\delta_k(x) = 0$ for $x \neq $ 'q' (force constraint). (3) Continue Viterbi for $k+1 \dots n$. (4) Backtrack from $\arg\max \delta_n(x)$. 
\textbf{Complexity:} $O(n \cdot |\mathcal{A}|^2)$ (same as standard).

\subsection*{4.4 Kernel Perceptron}
\textbf{Given:} Feature map $\phi(x)$ unknown, but kernel $k(x,x')=\langle \phi(x), \phi(x') \rangle$ known. \\
Perceptron (linear classifier): $h(x) = \text{sign}(w \cdot \phi(x) + b)$, where $w = \sum_i \alpha_i y^i \phi(x^i)$.

\textbf{Task 1: Derive prediction rule using kernel.}

Substitute $w$ into decision function:
%{\fontsize{5pt}{7pt}\selectfont$$ h(x) = \text{sign}\left( \sum_{i=1}^m \alpha_i y^i \langle \phi(x^i), \phi(x) \rangle + b \right) = \text{sign}\left( \sum_{i=1}^m \alpha_i y^i k(x^i, x) + b \right) $$}

{\fontsize{5pt}{7pt}\selectfont$$ h(x) = \text{sign}\left( \sum_{i=1}^m \alpha_i y^i \langle \phi(x^i), \phi(x) \rangle + b \right) = \text{sign}\left( \sum_{i=1}^m \alpha_i y^i k(x^i, x) + b \right) $$}

\textbf{Task 2: Training algorithm to find $\alpha, b$.}


\textbf{Loop} until convergence, for each $(x^t, y^t)$, \textbf{Init:} $\alpha_i = 0$ for all $i$, $b=0$:
\begin{itemize}
    \item Predict: $\hat{y} = \text{sign}\left(\sum_{i=1}^m \alpha_i y^i k(x^i, x^t) + b\right)$
    \item If mistake ($\hat{y} \neq y^t$): $\alpha_t \leftarrow \alpha_t + 1$, \quad $b \leftarrow b + y^t$
\end{itemize}
\subsection*{4.5 Mixture of Markov Models (EM Algorithm)}
\textbf{Model:} $p(s) = \pi p_1(s) + (1-\pi)p_2(s)$. \\
\textbf{Latent:} $z^j \in \{1, 2\}$ (component assignment). \\
\textbf{Params:} $\pi$, Initial $\rho_c(u)$, Transition $A_c(u,v)$ for $c \in \{1,2\}$.

\textbf{1. E-Step (Responsibilities):}
Compute $\gamma_{jc} = P(z^j=c|s^j)$.
$$ \gamma_{j1} = \frac{\pi p_1(s^j)}{\pi p_1(s^j) + (1-\pi) p_2(s^j)}, \quad \gamma_{j2} = 1 - \gamma_{j1} $$
Likelihood: $p_c(s^j) = \rho_c(s^j_1) \prod_{t=2}^n A_c(s^j_{t-1}, s^j_t)$.

\textbf{2. M-Step (Parameter Updates):}
\textit{Use weighted counts where weights are $\gamma_{jc}$.}

\textbf{Mixture:} $\pi = \frac{1}{m} \sum_{j=1}^m \gamma_{j1}$
\textbf{Initial:} $\rho_c(u) = \frac{\sum_{j=1}^m \gamma_{jc} \llbracket(s^j_1=u)\rrbracket}{\sum_{j=1}^m \gamma_{jc}}$

\textbf{Transition:} $A_c(u,v) = \frac{\sum_{j=1}^m \gamma_{jc} \sum_{t=2}^n \llbracket(s^j_{t-1}=u, s^j_t=v)\rrbracket}{\sum_{j=1}^m \gamma_{jc} \sum_{t=2}^n \llbracket(s^j_{t-1}=u)\rrbracket}$
\subsection*{4.6 Exp. Occurrences in Markov Chain}
\textbf{Task:} Given homogeneous Markov model $p(s_1)$ and trans. $p(s_i|s_{i-1})$.
Find expected count of state $k^*$ in sequence of length $n$: $\mathbb{E}[n(k^*)]$.

\textbf{Solution:} Linearity of Expectation.
$$ \mathbb{E}[n(k^*)] = \mathbb{E}\left[\sum_{i=1}^n \llbracket(s_i = k^*)\rrbracket\right] = \sum_{i=1}^n P(s_i = k^*) $$

\textbf{Algorithm (Marginals):}
1. Let vector $\mathbf{v}_i$ be marginal dist. at step $i$.
2. Init: $\mathbf{v}_1$ from given $p(s_1)$.
3. Iterate: $\mathbf{v}_i = \mathbf{v}_{i-1} \mathbf{T}$ for $i=2 \dots n$ (where $\mathbf{T}$ is transition matrix).
4. \textbf{Result:} Sum the $k^*$-th component of all $\mathbf{v}_i$.
$ \text{Total} = \sum_{i=1}^n \mathbf{v}_i[k^*] $
\textit{Complexity:} $\mathcal{O}(nK^2)$ (matrix-vector multiplication).

\subsection*{4.7 Markov transitions after n states}
Task: Given matrix $P_{k,k'}$ and init state $k=1$ compute marg. probs after n transitions.

$
P_{k,k'}=0.5\cdot
\begin{pmatrix}
1 & 0 & 0 \\
1 & 1 & 0 \\
0 & 1 & 2
\end{pmatrix}, \pi_0^T=(1, 0, 0)$. We compute $P^3\pi_0$, then $\pi_3^T=\frac{1}{8}(1, 3, 4)$
\subsection*{4.8 Mixture of Coins (Bernoulli, EM algorithm)}
\textbf{Model:} $x \in \{H, T\}$. Latent $z \in \{\text{Fair}, \text{Fake}\}$. 
\textbf{Params:} $\beta = P(\text{Fair})$, $\alpha = P(H|\text{Fake})$. \\
\textbf{Fixed:} $P(H|\text{Fair}) = 0.5$.

\textbf{1. E-Step (Responsibilities):} 
Compute probability that observation $x_i$ is \textbf{Fair} ($\gamma_i$).
$$ \gamma_i = P(\text{Fair}|x_i) = \frac{P(x_i|\text{Fair})\beta}{P(x_i|\text{Fair})\beta + P(x_i|\text{Fake})(1-\beta)} $$
\textit{Case $x_i = H$:} \quad $\gamma_i = \frac{0.5\beta}{0.5\beta + \alpha(1-\beta)}$, \\
\textit{Case $x_i = T$:} \quad $\gamma_i = \frac{0.5\beta}{0.5\beta + (1-\alpha)(1-\beta)}$

\textbf{2. M-Step (Updates):} 
Let $w_i = 1 - \gamma_i$ be the weight for the \textbf{Fake} coin.

$ \beta_{n} = \frac{1}{m} \sum_{i=1}^m \gamma_i \quad (\text{Proportion Fair}) $\\
$ \alpha_{n} = \frac{\sum_{i=1}^m w_i \cdot \llbracket x_i=H\rrbracket}{\sum_{i=1}^m w_i} \quad (\text{Weighted Heads for Fake}) $


\subsection*{4.9 Hidden markov model (HMM)}

Consider an HMM with hidden states $s_t \in \{A, B\}$ and binary observations $x_t \in \{0, 1\}$.

\begin{itemize} \setlength\itemsep{0em}
    \item \textbf{Initial Distribution:} Uniform.\\
    \textbf{Transition Matrix:}
    $
    P(s_t \mid s_{t-1}) = 
    \begin{bmatrix} 
    \beta & 1-\beta \\ 
    1-\beta & \beta 
    \end{bmatrix}
    $
    \item \textbf{Emission Probabilities:}
    \begin{align*}
        p(x_t \mid A) &= \begin{cases} \gamma & \text{if } x_t=1 \\ 1-\gamma & \text{if } x_t=0 \end{cases}\quad
        p(x_t \mid B) = \begin{cases} \delta & \text{if } x_t=1 \\ 1-\delta & \text{if } x_t=0 \end{cases}
    \end{align*}
\end{itemize}

\textbf{a) Joint Probability Expression} \\
The joint probability factorizes into the initial state, transitions, and emissions:
$$
    p(x, s) = p(s_1) \prod_{t=2}^{n} p(s_t \mid s_{t-1}) \prod_{t=1}^{n} p(x_t \mid s_t) \\
            \quad p(s_1) = 0.5
$$

\textbf{b) Calculation for $n=3$}
Given $x = (1, 0, 1)$ and $s = (A, B, A)$:
\begin{itemize}
    \item \textbf{t=1:} Start $A$, emit $1 \to 0.5 \cdot \gamma$, \textbf{Trans:} $A \to B \to (1-\beta)$
    \item \textbf{t=2:} State $B$, emit $0 \to (1-\delta)$, \textbf{Trans:} $B \to A \to (1-\beta)$
    \item \textbf{t=3:} State $A$, emit $1 \to \gamma$
\end{itemize}

\noindent \textbf{Result:}
    $P(x,s) = (0.5 \gamma) \cdot (1-\beta) \cdot (1-\delta) \cdot (1-\beta) \cdot \gamma$

%\noindent\framebox[\linewidth]{%
%  \begin{minipage}{0.95\linewidth}
%    \texttt{Notes:}
%    \vspace{2.5cm}
%  \end{minipage}%
%}

\subsection*{4.10 Markov sequences of blocks}
Task: MM with two states $s_i \in \{a,b\}$, probability matrix given by:
$p(s_i|s_{i-1})=\alpha\llbracket s_i=s_{i-1}\rrbracket+(1-\alpha)\llbracket s_i \neq s_{i-1}\rrbracket$

\textbf{$\text{a}_1$)} Most probable seq. for $\alpha > 0.5$:\quad $a^n$ or $b^n$\\
\textbf{$\text{a}_2$)} Most probable seq. for $\alpha < 0.5$:\quad $abab\cdots$ or $baba\cdots$.\\
\textbf{b)} Prob of seq with k-blocks of a's and b's
Prob of seq s with k blocks: $p(s)=p(s_1)\cdot \alpha^{n-k}\cdot (1-\alpha)^{k-1}$, $p(s_1)=0.5$ (the prob of staying $n-k$ times and switching $k-1$ times), number of possible seqs:
$\begin{pmatrix}
n - 1\\
k - 1
\end{pmatrix}$
Then: $p(S_k)=\left[2\cdot
\begin{pmatrix}
n - 1\\
k - 1
\end{pmatrix}
\right]\cdot\left[0.5\cdot \alpha^{n-k} (1-\alpha)^{k-1}\right]$ (2 for possible letters in the comb. num - a or b)

%\texttt{SSU-Exam cheatsheet by Nap/Ing.enjoyers; 2026}

%\begin{center}
%  \includegraphics[width=0.4\linewidth]{./esq.jpg}
%\end{center}

\vspace{-0.02cm}
\texttt{SSU-Exam cheatsheet by Nap/Ing.enjoyers; 2026}

\end{multicols*}

\end{document}