diff --git a/RELEASES.md b/RELEASES.md
index cbe6c84fe..7734fde9b 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -10,10 +10,11 @@ This new release adds support for sparse cost matrices in the exact EMD solver.
 - Add support for sparse cost matrices in EMD solver (PR #778, Issue #397)

 #### Closed issues
-- Fix O(n³) performance bottleneck in sparse bipartite graph arc iteration (PR #785)
+- Fix O(n³) performance bottleneck in sparse bipartite graph arc iteration (PR #785)
 - Fix deprecated JAX function in `ot.backend.JaxBackend` (PR #771, Issue #770)
 - Add test for build from source (PR #772, Issue #764)
 - Fix device for batch Ot solver in `ot.batch` (PR #784, Issue #783)
+- Clean documentation (PR #787)

 ## 0.9.6.post1
diff --git a/docs/source/user_guide.rst b/docs/source/user_guide.rst
index 39eb72275..fd9e32e57 100644
--- a/docs/source/user_guide.rst
+++ b/docs/source/user_guide.rst
@@ -832,7 +832,7 @@ alignment between two distributions can be expressed as the one minimizing:

         s.t. \gamma 1 = a; \gamma^T 1= b; \gamma\geq 0

-where ::math:`C1` is the distance matrix between samples in the source
+where :math:`C1` is the distance matrix between samples in the source
 distribution and :math:`C2` the one between samples in the target,
 :math:`L(C1_{i,k},C2_{j,l})` is a measure of similarity between
 :math:`C1_{i,k}` and :math:`C2_{j,l}` often chosen as
diff --git a/ot/bregman/_geomloss.py b/ot/bregman/_geomloss.py
index f901663a6..73d8d5d9a 100644
--- a/ot/bregman/_geomloss.py
+++ b/ot/bregman/_geomloss.py
@@ -125,7 +125,7 @@ def empirical_sinkhorn2_geomloss(
     The algorithm used for solving the problem is the Sinkhorn-Knopp matrix
     scaling algorithm as proposed in and computed in log space for better
     stability and epsilon-scaling. The solution is computed in a lazy way
-    using the Geomloss [60] and the KeOps library [61].
+    using the Geomloss [60]_ and the KeOps library [61]_.

     Parameters
     ----------
diff --git a/ot/coot.py b/ot/coot.py
index 6f4cfa6f0..fd59a50b5 100644
--- a/ot/coot.py
+++ b/ot/coot.py
@@ -412,13 +412,14 @@ def co_optimal_transport2(
     warmstart : dictionary, optional (default = None)
         Contains 4 keys:
         - "duals_sample" and "duals_feature" whose values are
-          tuples of 2 vectors of size (n_sample_x, n_sample_y) and (n_feature_x, n_feature_y).
-          Initialization of sample and feature dual vectors
-          if using Sinkhorn algorithm. Zero vectors by default.
+          tuples of 2 vectors of size (n_sample_x, n_sample_y) and (n_feature_x, n_feature_y).
+          Initialization of sample and feature dual vectors
+          if using Sinkhorn algorithm. Zero vectors by default.
+
         - "pi_sample" and "pi_feature" whose values are matrices
-          of size (n_sample_x, n_sample_y) and (n_feature_x, n_feature_y).
-          Initialization of sample and feature couplings.
-          Uniform distributions by default.
+          of size (n_sample_x, n_sample_y) and (n_feature_x, n_feature_y).
+          Initialization of sample and feature couplings.
+          Uniform distributions by default.
     nits_bcd : int, optional (default = 100)
         Number of Block Coordinate Descent (BCD) iterations to solve COOT.
     tol_bcd : float, optional (default = 1e-7)
diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py
index 597ac1524..5b43a6250 100644
--- a/ot/gromov/_bregman.py
+++ b/ot/gromov/_bregman.py
@@ -432,7 +432,7 @@ def BAPG_gromov_wasserstein(
              \mathbf{T} &\geq 0

     Else, the function solves an equivalent problem [63], where constant terms only
-    depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are
+    depending on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}` are
     discarded while assuming that L decomposes as in Proposition 1 in [12]:

     .. math::
@@ -450,7 +450,7 @@ def BAPG_gromov_wasserstein(
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity matrices
-      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
         returned by this function does not necessarily satisfy the marginal
@@ -650,7 +650,7 @@ def BAPG_gromov_wasserstein2(
              \mathbf{T} &\geq 0

     Else, the function solves an equivalent problem [63, 64], where constant terms only
-    depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are
+    depending on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}` are
     discarded while assuming that L decomposes as in Proposition 1 in [12]:

     .. math::
@@ -668,7 +668,7 @@ def BAPG_gromov_wasserstein2(
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity matrices
-      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
         returned by this function does not necessarily satisfy the marginal
@@ -1439,12 +1439,13 @@ def BAPG_fused_gromov_wasserstein(
              \mathbf{T} &\geq 0

     Else, the function solves an equivalent problem [63, 64], where constant terms only
-    depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are
+    depending on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}` are
     discarded while assuming that L decomposes as in Proposition 1 in [12]:

     .. math::
         \mathbf{T}^* \in\mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F - \alpha \langle h_1(\mathbf{C}_1) \mathbf{T} h_2(\mathbf{C_2})^\top , \mathbf{T} \rangle_F
+
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}

              \mathbf{T}^T \mathbf{1} &= \mathbf{q}
@@ -1459,7 +1460,7 @@ def BAPG_fused_gromov_wasserstein(
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
-      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
     - :math:`\alpha`: trade-off parameter

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
@@ -1672,12 +1673,13 @@ def BAPG_fused_gromov_wasserstein2(
              \mathbf{T} &\geq 0

     Else, the function solves an equivalent problem [63, 64], where constant terms only
-    depending on the marginals :math:`\mathbf{p}`: and :math:`\mathbf{q}`: are
+    depending on the marginals :math:`\mathbf{p}` and :math:`\mathbf{q}` are
     discarded while assuming that L decomposes as in Proposition 1 in [12]:

     .. math::
         \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F - \alpha \langle h_1(\mathbf{C}_1) \mathbf{T} h_2(\mathbf{C_2})^\top , \mathbf{T} \rangle_F
+
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}

              \mathbf{T}^T \mathbf{1} &= \mathbf{q}
@@ -1691,7 +1693,7 @@ def BAPG_fused_gromov_wasserstein2(
     - :math:`\mathbf{p}`: distribution in the source space
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
-      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
+      satisfying :math:`L(a, b) = f_1(a) + f_2(b) - h_1(a) h_2(b)`
     - :math:`\alpha`: trade-off parameter

     .. note:: By algorithmic design the optimal coupling :math:`\mathbf{T}`
diff --git a/ot/gromov/_quantized.py b/ot/gromov/_quantized.py
index f4a8fafa7..861cbfd46 100644
--- a/ot/gromov/_quantized.py
+++ b/ot/gromov/_quantized.py
@@ -462,7 +462,8 @@ def format_partitioned_graph(
     with structure matrix :math:`(\mathbf{C} \in R^{n \times n}`, feature matrix
     :math:`(\mathbf{F} \in R^{n \times d}` and node relative importance
     :math:`(\mathbf{p} \in \Sigma_n`, into a partitioned attributed graph
-    taking into account partitions and representants :math:`\mathcal{P} = \left{(\mathbf{P_{i}}, \mathbf{r_{i}})\right}_i`.
+    taking into account partitions and representants
+    :math:`\mathcal{P} = \left\{(\mathbf{P_{i}}, \mathbf{r_{i}})\right\}_i`.

     Parameters
     ----------
@@ -966,7 +967,8 @@ def format_partitioned_samples(X, p, part, rep_indices, F=None, alpha=1.0, nx=No
     with euclidean structure matrix :math:`(\mathbf{D}(\mathbf{X}) \in R^{n \times n}`,
     feature matrix :math:`(\mathbf{F} \in R^{n \times d}` and node relative importance
     :math:`(\mathbf{p} \in \Sigma_n`, into a partitioned attributed graph
-    taking into account partitions and representants :math:`\mathcal{P} = \left{(\mathbf{P_{i}}, \mathbf{r_{i}})\right}_i`.
+    taking into account partitions and representants
+    :math:`\mathcal{P} = \left\{(\mathbf{P_{i}}, \mathbf{r_{i}})\right\}_i`.

     Parameters
     ----------
diff --git a/ot/gromov/_semirelaxed.py b/ot/gromov/_semirelaxed.py
index 8c60b2569..02fafc874 100644
--- a/ot/gromov/_semirelaxed.py
+++ b/ot/gromov/_semirelaxed.py
@@ -64,7 +64,6 @@ def semirelaxed_gromov_wasserstein(
     - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
     - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
     - :math:`\mathbf{p}`: distribution in the source space
-    - `L`: loss function to account for the misfit between the similarity matrices

     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends.
@@ -883,7 +882,6 @@ def entropic_semirelaxed_gromov_wasserstein(
     - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
     - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
     - :math:`\mathbf{p}`: distribution in the source space
-    - `L`: loss function to account for the misfit between the similarity matrices

     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends.
@@ -1070,6 +1068,7 @@ def entropic_semirelaxed_gromov_wasserstein2(
     Note that when using backends, this loss function is differentiable wrt the matrices
     (C1, C2) but not yet for the weights p.
+
     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. However all the steps in the conditional
         gradient are not differentiable.
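
The BAPG docstrings above repeatedly note that, by algorithmic design, the returned coupling need not satisfy the marginal constraints exactly. The following is a minimal illustrative sketch (not part of the patch) of how one might check this in practice, assuming the (C1, C2, p, q, loss_fun, epsilon) calling convention described in those docstrings for ot.gromov.BAPG_gromov_wasserstein; the epsilon value is only an example:

import numpy as np
import ot

rng = np.random.RandomState(0)
xs = rng.randn(20, 2)
xt = rng.randn(30, 3)

C1 = ot.dist(xs, xs)  # intra-domain cost matrices
C2 = ot.dist(xt, xt)
p = ot.unif(20)       # requested marginals
q = ot.unif(30)

# solver name and first arguments follow the docstrings edited above;
# the exact keyword set is an assumption
T = ot.gromov.BAPG_gromov_wasserstein(C1, C2, p=p, q=q, loss_fun="square_loss", epsilon=0.1)

# the marginal violation is expected to be small but not exactly zero
print(np.abs(T.sum(axis=1) - p).max(), np.abs(T.sum(axis=0) - q).max())
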
diff --git a/ot/gromov/_unbalanced.py b/ot/gromov/_unbalanced.py
index 6019c20c8..6971b6a58 100644
--- a/ot/gromov/_unbalanced.py
+++ b/ot/gromov/_unbalanced.py
@@ -50,10 +50,10 @@ def fused_unbalanced_across_spaces_divergence(
     with the distributions on rows and columns. We consider two cases of matrix:

     - (Squared) similarity matrix in Gromov-Wasserstein setting,
-      whose rows and columns represent the samples.
+      whose rows and columns represent the samples.

     - Arbitrary-size matrix in Co-Optimal Transport setting,
-      whose rows represent samples, and columns represent corresponding features/dimensions.
+      whose rows represent samples, and columns represent corresponding features/dimensions.

     More precisely, this function returns the sample and feature transport plans between
     :math:`(\mathbf{X}, \mathbf{w}_{xs}, \mathbf{w}_{xf})` and
diff --git a/ot/lp/_barycenter_solvers.py b/ot/lp/_barycenter_solvers.py
index 3e262bf94..e37fbecbc 100644
--- a/ot/lp/_barycenter_solvers.py
+++ b/ot/lp/_barycenter_solvers.py
@@ -427,16 +427,20 @@ def generalized_free_support_barycenter(


 def ot_barycenter_energy(measure_locations, measure_weights, X, a, cost_list, nx=None):
     r"""
     Computes the energy of the OT barycenter functional for a given barycenter
-    support `X` and weights `a`: .. math::
+    support `X` and weights `a`:
+
+    .. math::
         V(X, a) = \sum_{k=1}^K w_k \mathcal{T}_{c_k}(X, a, Y_k, b_k),
-    where: - :math:`X` (n, d) is the barycenter support, - :math:`a` (n) is the
-    barycenter weights, - :math:`Y_k` (m_k, d_k) is the k-th measure support
-    (`measure_locations[k]`),
+    where:
+
+    - :math:`X` (n, d) is the barycenter support,
+    - :math:`a` (n) is the barycenter weights,
+    - :math:`Y_k` (m_k, d_k) is the k-th measure support (`measure_locations[k]`),
     - :math:`b_k` (m_k) is the k-th measure weights (`measure_weights[k]`),
     - :math:`c_k: \mathbb{R}^{n\times d}\times\mathbb{R}^{m_k\times d_k}
-      \rightarrow \mathbb{R}_+^{n\times m_k}` is the k-th cost function
-      (which computes the pairwise cost matrix)
+      \rightarrow \mathbb{R}_+^{n\times m_k}` is the k-th cost function
+      (which computes the pairwise cost matrix)
     - :math:`\mathcal{T}_{c_k}(X, a, Y_k, b)` is the OT cost between the
       barycenter measure and the k-th measure with respect to the cost :math:`c_k`.
diff --git a/ot/optim.py b/ot/optim.py
index d94200cad..e6246b3c1 100644
--- a/ot/optim.py
+++ b/ot/optim.py
@@ -164,7 +164,7 @@ def generic_conditional_gradient(
     conditional gradient or generalized conditional gradient depending on the
     provided linear program solver.

-    The function solves the following optimization problem if set as a conditional gradient:
+    The function solves the following optimization problem if set as a conditional gradient:

     .. math::
         \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
@@ -172,7 +172,7 @@ def generic_conditional_gradient(

         s.t. \ \gamma \mathbf{1} &= \mathbf{a}

-             \gamma^T \mathbf{1} &= \mathbf{b} (optional constraint)
+             \gamma^T \mathbf{1} &= \mathbf{b} \ (\text{optional constraint})

              \gamma &\geq 0

@@ -184,7 +184,7 @@ def generic_conditional_gradient(
     The algorithm used for solving the problem is conditional gradient as
     discussed in :ref:`[1] `

-    The function solves the following optimization problem if set a generalized conditional gradient:
+    The function solves the following optimization problem if set a generalized conditional gradient:

     .. math::
         \gamma = \mathop{\arg \min}_\gamma \quad \langle \gamma, \mathbf{M} \rangle_F +
diff --git a/ot/partial/partial_solvers.py b/ot/partial/partial_solvers.py
index d1fbdca01..98a3eff26 100755
--- a/ot/partial/partial_solvers.py
+++ b/ot/partial/partial_solvers.py
@@ -270,9 +270,9 @@ def partial_wasserstein(a, b, M, m=None, nb_dummies=1, log=False, **kwargs):
     See Also
     --------
     ot.partial.partial_wasserstein_lagrange: Partial Wasserstein with
-        regularization on the marginals
+        regularization on the marginals
     ot.partial.entropic_partial_wasserstein: Partial Wasserstein with a
-        entropic regularization parameter
+        entropic regularization parameter
     """

     a, b, M = list_to_array(a, b, M)
@@ -586,7 +586,7 @@ def gwgrad_partial(C1, C2, T):
     as the marginals may not sum to 1.

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.gwggrad` instead.
+        `ot.gromov.gwggrad` instead.

     Parameters
     ----------
@@ -629,7 +629,7 @@ def gwloss_partial(C1, C2, T):
     """Compute the GW loss.

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.gwloss` instead.
+        `ot.gromov.gwloss` instead.

     Parameters
     ----------
@@ -700,7 +700,7 @@ def partial_gromov_wasserstein(
     :ref:`[29] `

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.partial_gromov_wasserstein` instead.
+        `ot.gromov.partial_gromov_wasserstein` instead.

     Parameters
     ----------
@@ -908,7 +908,7 @@ def partial_gromov_wasserstein2(
     :ref:`[29] `

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.partial_gromov_wasserstein2` instead.
+        `ot.gromov.partial_gromov_wasserstein2` instead.

     Parameters
     ----------
@@ -1048,7 +1048,7 @@ def entropic_partial_gromov_wasserstein(
     partial GW in :ref:`[29] `

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.entropic_partial_gromov_wasserstein` instead.
+        `ot.gromov.entropic_partial_gromov_wasserstein` instead.

     Parameters
     ----------
@@ -1099,7 +1099,7 @@ def entropic_partial_gromov_wasserstein(
     Returns
     -------
-    :math:`gamma` : ndarray, shape (dim_a, dim_b)
+    gamma : ndarray, shape (dim_a, dim_b)
         Optimal transportation matrix for the given parameters
     log : dict
         log dictionary returned only if `log` is `True`
@@ -1219,7 +1219,7 @@ def entropic_partial_gromov_wasserstein2(
     partial GW in :ref:`[29] `

     .. note:: This function will be deprecated in a near future, please use
-        `ot.gromov.entropic_partial_gromov_wasserstein2` instead.
+        `ot.gromov.entropic_partial_gromov_wasserstein2` instead.

     Parameters
     ----------
diff --git a/ot/sliced.py b/ot/sliced.py
index 3cf2002e7..81d0bd4a3 100644
--- a/ot/sliced.py
+++ b/ot/sliced.py
@@ -192,7 +192,7 @@ def max_sliced_wasserstein_distance(
     Computes a Monte-Carlo approximation of the max p-Sliced Wasserstein distance

     .. math::
-        \mathcal{Max-SWD}_p(\mu, \nu) = \underset{\theta _in
+        \mathcal{Max-SWD}_p(\mu, \nu) = \underset{\theta \in
         \mathcal{U}(\mathbb{S}^{d-1})}{\max} [\mathcal{W}_p^p(\theta_\# \mu,
         \theta_\# \nu)]^{\frac{1}{p}}

@@ -340,6 +340,7 @@ def projection_sphere_to_circle(
     Projection of :math:`x\in S^{d-1}` on circles using coordinates on [0,1[.

     To get the projection on the circle, we use the following formula:
+
     .. math::
         P^U(x) = \frac{U^Tx}{\|U^Tx\|_2}
diff --git a/ot/unbalanced/_lbfgs.py b/ot/unbalanced/_lbfgs.py
index ea273c7db..9591b976d 100644
--- a/ot/unbalanced/_lbfgs.py
+++ b/ot/unbalanced/_lbfgs.py
@@ -183,9 +183,9 @@ def lbfgsb_unbalanced(
     - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions
     - :math:`\mathbf{c}` is a reference distribution for the regularization
     - :math:`\mathrm{div_m}` is a divergence, either Kullback-Leibler divergence,
-      or half-squared :math:`\ell_2` divergence, or Total variation
+      or half-squared :math:`\ell_2` divergence, or Total variation
     - :math:`\mathrm{div}` is a divergence, either Kullback-Leibler divergence,
-      or half-squared :math:`\ell_2` divergence
+      or half-squared :math:`\ell_2` divergence

     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. First, it converts all arrays into Numpy arrays,
@@ -372,9 +372,9 @@ def lbfgsb_unbalanced2(
     - :math:`\mathbf{a}` and :math:`\mathbf{b}` are source and target unbalanced distributions
     - :math:`\mathbf{c}` is a reference distribution for the regularization
     - :math:`\mathrm{div_m}` is a divergence, either Kullback-Leibler divergence,
-      or half-squared :math:`\ell_2` divergence, or Total variation
+      or half-squared :math:`\ell_2` divergence, or Total variation
     - :math:`\mathrm{div}` is a divergence, either Kullback-Leibler divergence,
-      or half-squared :math:`\ell_2` divergence
+      or half-squared :math:`\ell_2` divergence

     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. First, it converts all arrays into Numpy arrays,
diff --git a/ot/unbalanced/_mm.py b/ot/unbalanced/_mm.py
index 0d40f909b..897375259 100644
--- a/ot/unbalanced/_mm.py
+++ b/ot/unbalanced/_mm.py
@@ -272,7 +272,7 @@ def mm_unbalanced2(
         By default, solve the unregularized problem
     c : array-like (dim_a, dim_b), optional (default = None)
         Reference measure for the regularization.
-        If None, then use :math:`\mathbf{c} = mathbf{a} mathbf{b}^T`.
+        If None, then use :math:`\mathbf{c} = \mathbf{a} \mathbf{b}^T`.
     div: string, optional
         Divergence to quantify the difference between the marginals.
         Can take two values: 'kl' (Kullback-Leibler) or 'l2' (half-squared)
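
As a closing illustration (again outside the patch itself), the default reference measure documented for mm_unbalanced2, :math:`\mathbf{c} = \mathbf{a} \mathbf{b}^T`, can be passed explicitly and should give the same value as leaving c=None. This sketch assumes the (a, b, M, reg_m, c=None, div=...) signature suggested by the parameter list above, and the reg_m value is only an example:

import numpy as np
import ot

rng = np.random.RandomState(0)
xs = rng.randn(15, 2)
xt = rng.randn(10, 2)
a = ot.unif(15)
b = ot.unif(10)
M = ot.dist(xs, xt)  # pairwise cost matrix

# leaving c=None is documented as equivalent to passing the product measure a b^T
loss_default = ot.unbalanced.mm_unbalanced2(a, b, M, reg_m=1.0, div="kl")
loss_explicit = ot.unbalanced.mm_unbalanced2(a, b, M, reg_m=1.0, c=a[:, None] * b[None, :], div="kl")
print(np.allclose(loss_default, loss_explicit))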