%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%                                                               %%
%% This is the mc_template.tex file for the mc document class.   %%
%% It is used to prepare a manuscript for Mathematical           %%
%% Communications journal.                                       %%
%%                                                               %%
%% The mc.cls class works only with a pdflatex engine.           %%
%% The file mc.cls should be placed where LaTeX                  %%
%% can find it, e.g. in the current working directory.		     %%
%%                                                               %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\documentclass{mc} 

%%===============================================================%%
%% Please add here your own packages, macros and environments.   %%
%% It is not necessary to include ams* and graphicx packages     %%
%% since they are automatically included by the mc class.        %%
%% Avoid defining your own environments and use the already      %%
%% defined ones (e.g.~theorem, lemma etc.)                       %%
%%===============================================================%%

%\usepackage{enumerate}  % uncomment to use this package
%\newcommand{\E}{\mathbb{E}} % example of a macro
%\usepackage{float}      
%\usepackage{subcaption} 
\usepackage{multirow}
%\usepackage{caption}
%\usepackage{subcaption} 
\usepackage[labelformat=simple]{subcaption}  
\renewcommand\thesubfigure{(\alph{subfigure})}

\usepackage{etoolbox}
\makeatletter
\AtBeginEnvironment{figure}{\par\nolinenumbers}
\AtEndEnvironment{figure}{\linenumbers}
\AtBeginEnvironment{table}{\par\nolinenumbers}
\AtEndEnvironment{table}{\linenumbers}
\makeatother

%%===============================================================%%


%%===============================================================%%
%% Journal info will be edited by the typesetter 				 %%
%% DO NOT CHANGE THIS PART						                 %%
%%===============================================================%%
\setcounter{page}{1}
\renewcommand\thisnumber{x}
\renewcommand\thisyear {2026}
\renewcommand\thismonth{xxx}
\renewcommand\thisvolume{31}
\renewcommand\datereceived{October 14, 2025}
\renewcommand\dateaccepted{March 5, 2026}
\renewcommand\doinum{10.1000/100}
%%===============================================================%%




\begin{document}
%\linenumbers

%%===============================================================%%
%% TITLE                                                         %%
%% Please add the title with \title[Short title]{Title}          %%
%% Short title is a running head appearing in the header.        %%
%%===============================================================%%
\title[Wavelet estimators in a mixture density model]	% at most 50 characters including spaces
		{Optimal convergence rates of wavelet estimators for a hidden density in a mixture model} 	% at most 150 characters including spaces ()
%%===============================================================%%


%%===============================================================%%
%% AUTHOR(S)                                                     %%
%%                                                               %%
%% Add author's details in the following format. For each author %%
%% provide the affiliation, address and Orcid identifier.		 %%
%% Mark the corresponding author with \comma\corrauth.			 %%
%%===============================================================%%
\author[J.~Kou and D.~Liang] % put here short author names for header
	{Junke Kou\corrauth\orcidnumber{0000-0001-6271-9238}
		and
	 Dan Liang\orcidnumber{0009-0002-5105-9355}
    % and %
	 %Third Author\affil2\orcidnumber{0000-0000-0000-0000} % if Second and Third authors share the same affiliation
	}		 

\address{ School of Mathematics and Computational Science, Guilin University of Electronic Technology,  Guilin-541004, China 
		} 

\emails{%
	\email{kjkou@guet.edu.cn} (J.~Kou),
	\email{liangdan2012@163.com} (D.~Liang)
%	\email{tauthor@gmail.com} (T.~Author)
		}

%% For single author use the following format. 				     %%
%\author[F.~Author]%
%	{First Author\orcidnumber{0000-0000-0000-0000}
%	}		 
%
%\address{Affiliation 1} 
%
%\emailsingle{%
%	\email{fauthor@mathos.hr}
%		}
%%===============================================================%%


%%===============================================================%%
%% ABSTRACT                                                      %%
%%===============================================================%%
\begin{abstract}
This paper investigates nonparametric estimations of a density function in a mixture model. Firstly, a lower bound estimation under $L^{p}(1\leq p<+\infty)$ error of an arbitrary density estimator is discussed. Secondly, a linear estimator and an adaptive nonlinear estimator of the unknown density function are constructed by the wavelet method. The rates of convergence of those two wavelet estimators are discussed with some mild conditions. Combining with the lower bound estimations, two wavelet estimators can attain the optimal convergence rate. Finally, numerical examples are given to verify the performance of the two wavelet estimators.
\end{abstract}
%%===============================================================%%


%%===============================================================%%
%% KEYWORDS                                                      %%
%%===============================================================%%
\keywords{nonparametric estimation; density function; mixture model; $L^{p}$ risk}
%%===============================================================%%


%%===============================================================%%
%% AMS subject classification                                    %%
%%===============================================================%%
\ams{62G07, 62G20, 42C40}
%%===============================================================%%


%%===============================================================%%
\maketitle
%%===============================================================%%





%%===============================================================%%
%% MAIN BODY                                                     %%
%%===============================================================%%

\section{Introduction}
%%\noindent {\bf2. Preliminary }
%%\vspace{10pt}
\quad This paper considers the following mixture density model:
\begin{equation}\label{1.1}
	g(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x}) , \boldsymbol{x}\in\Omega.
\end{equation}
In the above equation, $g(\boldsymbol{x})$ is the common density function of the independent and identically distributed (i.i.d.) random vectors $ X_1, X_2, \ldots, X_n $. The other two functions $h(\boldsymbol{x})$ and $f(\boldsymbol{x})$ are both bounded density functions, and the function $h(\boldsymbol{x})$ is known.
$\Omega$ denotes a compact subset of ${\mathbb{R}}^{d}$. The parameter $\theta\in(0,1)$ is a known mixing ratio. The aim of this model is to estimate the unknown density $f(\boldsymbol{x})$ from the observed data $X_{1}, X_{2},\ldots,X_{n}$. 

As a powerful statistical tool, the above mixture model plays a significant role in statistics, economics, big data processing, and other fields, see \cite{Patra, Yang, Ma, Bodha, Mostofi}. In the context of dealing with contamination problems \cite{McLachlanPeel2000}, model ({\color{blue}\ref{1.1}}) describes the case where the density function $f(\boldsymbol{x})$ of an unknown distribution is contaminated by an arbitrary distribution $h(\boldsymbol{x})$ with a proportion $\theta$, which constitutes a particular instance of the pollution model \cite{Huber}. In addition, Maiboroda and Sugakova \cite{Maiboroda}, Chen et al. \cite{Chen}, and Liu and Gao \cite{Liu} made different assumptions about the contamination model to study the problem of nonparametric density estimation.

The mixture model is also widely used in multiple testing problems, such as microarray analysis \cite{Bertolino, Maria}, neuroimaging \cite{Shu, Winkler}, and other related fields. In the model, $h(\boldsymbol{x})$ and $f(\boldsymbol{x})$ represent the densities of the observed values under the null hypothesis and the alternative hypothesis, respectively, and $\theta$ is the asymptotic proportion of the true null hypotheses. Efron et al. \cite{Efron} employed model ({\color{blue}\ref{1.1}}) to estimate the local false discovery rate (FDR), which is defined as the posterior probability derived from the mixture model. Robin et al. \cite{Robin} and Nguyen and Matias \cite{Nguyen} further proposed stochastic weighted kernel estimators for $f(\boldsymbol{x})$, with weights based on the posterior probabilities.

For a nonparametric mixture density model, the kernel function method is commonly used by many researchers, such as Marzio and Taylor \cite{Marzio}, Liu and Yu \cite{Liu2},  Zhang et al. \cite{Zhang}, and Chagny et al. \cite{Chagny}. Nevertheless, when the function has sharp peaks and prominent parts, the corresponding kernel estimator is not particularly effective. Wavelet estimation is characterized by its multiresolution analysis and adaptivity. The wavelet transform refines various unknown functions and signals progressively across multiple scales through dilation and translation. For more details, refer to \cite{Härdle, Abramovich}. Therefore, it has achieved relatively significant accomplishments in the field of nonparametric estimation. Baldi et al. \cite{Baldi} proposed an adaptive density estimation method based on spherical wavelets (needlets), and proved that it has the theoretically optimal rate of convergence.  Wang \cite{Wang}  provided the optimal rate of convergence for density estimation in Besov space with Fano's lemma. Chesneau et al. \cite{Chesneau} proposed linear and nonlinear wavelet estimators for 2D continuous-discrete density functions, and studied integrated mean squared error of wavelet estimators in Besov balls. Chesneau et al. \cite{Chesneau2020} investigated the nonparametric estimation of regression models with additive and multiplicative noise.  Juditsky and Lambert-Lacroix \cite{Juditsky} studied the optimal rate of convergence of the $L^p(1\leq p<+\infty)$ risk using biquadrature wavelet estimation of the density function in one-dimensional Hölder space. Shen et al. \cite{Shen} studied the consistency and asymptotic normality of wavelet estimators of the regression function under the $\varphi$-mixed case. Cao and  Zeng \cite{Cao} proposed a data-driven wavelet estimation method for estimating the derivatives of the density function. 

The main contributions of this paper are as follows. We consider nonparametric optimal estimations of a density function in a mixture model. Firstly, a lower bound estimation over $L^p(1\leq p<+\infty)$ risk of an arbitrary possible density estimator of the unknown density function $f(\boldsymbol{x})$ is proved. Secondly, a linear density estimator is constructed by using the wavelet method. A convergence rate of this linear estimator is discussed with $f(\boldsymbol{x})\in B_{\eta,q}^s({{\Omega}})$ and some other mild conditions. Then, we can easily see that this linear wavelet estimator can attain the optimal convergence rate when $p<\eta$. Furthermore, in order to overcome the shortcoming of this nonadaptive linear estimator, a nonlinear wavelet-based estimator is proposed by using the hard thresholding algorithm. According to the lower bound estimations and the corresponding theoretical results of this nonlinear wavelet-based estimator, this nonlinear estimator can get the optimal convergence rates  up to a $\ln n$ factor in the cases of $p<\eta$ or $p\geq\eta$. Finally, numerical experiments indicate that both wavelet-based estimators can effectively estimate the unknown density function $f(\boldsymbol{x})$. 

This paper is structured as follows. Wavelet theory and Besov space are introduced in Section \ref{S2}. A lower bound estimation of any possible density estimator of the unknown density function in Besov spaces $ B_{\eta,q}^s({{\Omega}})$ is proved in Section \ref{S3}. The upper bound estimations over $L^p(1\leq p<+\infty)$ risk of two wavelet-based density estimators are discussed in Section \ref{S4}. The numerical simulation studies are shown in Section~\ref{S5}.



\section{Wavelet and Besov space}\label{S2}
\quad The aim of this paper is to study wavelet estimation of a density function in Besov spaces based on a mixture model. We begin with the notion of multiresolution analysis (MRA). A sequence \(\{V_j\}_{j \in \mathbb{Z}}\) of closed subspaces of the space $L^{2}(\mathbb{R}^{d})$ with $V_{j}\subset V_{j+1}$ for any $j\in\mathbb{Z}$ is called an MRA if the following conditions hold:

(1) $\bigcap V_{j}=\{0\}$ and $\overline{\bigcup V_{j}}=L^{2}(\mathbb{R}^{d});$

(2) $f(\boldsymbol{x})\in V_{0}$ if and only if $f(2^{j}\boldsymbol{x})\in V_{j};$

(3) There is a function $\Phi(\boldsymbol{x})\in V_{0}$ such that $\{\Phi(\boldsymbol{x}-\boldsymbol{k}), \boldsymbol{k}\in \mathbb{Z}^{d}\}$ is an orthonormal basis in $V_{0}$.

In the present study, we adopt the compactly supported scaling function $\Phi(\boldsymbol{x})$ from the family of Daubechies. For any $ u\in \{1, 2, \ldots, 2^{d}-1\}$, we introduce the associated compactly supported wavelet function $\Psi_u(\boldsymbol{x})$. Furthermore, for a positive integer $\tau$ and $\Lambda_{j}=\{0, 1, \ldots, 2^{j}-1\}^d$,
it is well known that  $S=\{{\Phi _{\tau,\textit{\textbf{k}}}}:=2^{\tau d/2}\Phi(2^{\tau}\boldsymbol{x}-\boldsymbol{k}), {\Psi _{j,\textit{\textbf{k}},u}}:=2^{jd/2}\Psi_u(2^{j}\boldsymbol{x}-\boldsymbol{k}), j\geq\tau, \textit{\textbf{k}}\in 
\Lambda_{j}\}$ constitutes an orthonormal basis for  ${L^2}{(\Omega)}$. For any positive integer $j_*\geq\tau$, any function $F(\boldsymbol{x})\in {L^2}{(\Omega)}$ can be expanded in the wavelet basis 
$S$ into the following wavelet series:
\begin{equation}\label{2.1}
	F(\boldsymbol{x}) = \sum\limits_{\textit{\textbf{k}} \in {\Lambda _{{j_*}}}} {{\alpha _{{j_*},\textit{\textbf{k}}}}} {\Phi _{{j_*},\textit{\textbf{k}}}}(\boldsymbol{x}) + \sum\limits_{j = {j_*}}^\infty   \sum\limits_{u = {1}}^{{2^d-1}}{\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {{\beta _{j,\textit{\textbf{k}},u}} } } {\Psi _{j,\textit{\textbf{k}},u}}(\boldsymbol{x}),\boldsymbol{x} \in {\Omega}.
\end{equation}
In this equation, ${\alpha _{j_*,\textit{\textbf{k}}}} = \int_{\Omega}F(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}$ and ${\beta _{j,\textit{\textbf{k}},u}} = \int_{\Omega}F(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}$. 

Let ${\rm P}_{V_{j}}$ denote the orthogonal projection operator. This operator maps functions from the space ${L^2}{(\Omega)}$ onto the subspace $V_{j}$.
Then, for any $F(\boldsymbol{x})\in {L^2}{(\Omega)}$,
\begin{gather*}
	{\rm P}_{V_{j}}F(\boldsymbol{x})=\sum_{\textit{\textbf{k}}\in\Lambda_{j}}\alpha_{j,\textit{\textbf{k}}}\Phi_{j,\textit{\textbf{k}}}(\boldsymbol{x}), \boldsymbol{x}\in \Omega.
\end{gather*}

\begin{lemma}\label{y2.2}
	Assume that a scaling function $\Phi(\boldsymbol{x})$ is $m$-regular, meaning that
	$\Phi(\boldsymbol{x})\in{{C}^{m}}$ and \linebreak $| {{D^a}\Phi(\boldsymbol{x})}|\le c{({1 + {{|\boldsymbol{x}|}^2}})^{-l}}$ for each $l\in{\mathbb{N}}$ and $a=0, 1, \ldots, m$. For any sequence $\left\lbrace a_{\textit{\textbf{k}}}\right\rbrace  \in{l_p} $ (i.e.,  $\|(a_{\textit{\textbf{k}}})\|_p:=\left(\sum_{\textit{\textbf{k}}} |a_{\textit{\textbf{k}}}|^p \right)^{\frac{1}{p}}<\infty$), when $ 1\le p \le \infty $, there exist constants $ 0 <c_{1} \le c_{2}$ satisfying:
	\begin{gather*}
		c_{1}2^{j(\frac{d}{2}-\frac{d}{p})} \left\|(a_{\textit{\textbf{k}}})\right\|_{p} \le\left\|\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} a_{\textit{\textbf{k}}} 2^{\frac{jd}{2}} \Phi(2^{j}\boldsymbol{x}-\boldsymbol{k}) \right\|_{p} \le  	c_{2}2^{j(\frac{d}{2}-\frac{d}{p})} \left\|(a_{\textit{\textbf{k}}} )\right\|_{p}.
	\end{gather*}
\end{lemma}

It is well known that Besov spaces are very important function spaces that can be characterized by wavelet bases. The equivalent definition of Besov spaces given in terms of wavelet coefficients is given below \cite{Härdle}.
\begin{lemma}\label{y}
	Suppose the scaling function $\Phi(\boldsymbol{x})$ is regular of order $m$, $0<s<m$; let $F(\boldsymbol{x})\in {L^p}{(\Omega)}$, $1\leq p,q\leq \infty$. Then the following statements are equivalent:\\
	$\hspace*{3mm}(1)\hspace*{3mm} F(\boldsymbol{x})\in B_{p,q}^s({{\Omega}});$\\
	$\hspace*{3mm}(2)\hspace*{3mm} \{{{2^{js}}{{\|{{\rm P}_{V_{j+1}}F(\boldsymbol{x})-{\rm P}_{V_{j}}F(\boldsymbol{x})}\|}_p}}\}\in{l_q};$\\
	$\hspace*{3mm}(3)\hspace*{3mm} \{{{2^{j({s-\frac{d}{p} + \frac{d}{2}})}}{{\|{{\beta _{j,\textit{\textbf{k}},u}}}\|}_p}}\}\in{l_q}$.\\
	The Besov norm of $F(\boldsymbol{x})$ is defined as
	\[{\left\| F(\boldsymbol{x}) \right\|_{B_{p,q}^s}} := {\left\| {\left( {{\alpha _{{\tau},\textit{\textbf{k}}}}} \right)} \right\|_p} + {\left\| {{{( {{2^{j({s-\frac{d}{p} + \frac{d}{2}})}}{{\|{{\beta _{j,\textit{\textbf{k}},u}}}\|}_p}})}_{j \ge {\tau}}}} \right\|_q},\]
	where $\|(\alpha_{\tau, \textit{\textbf{k}}})\|_p:=\left(\sum_{\textit{\textbf{k}} \in {\Lambda _\tau}} |\alpha_{\tau, \textit{\textbf{k}}}|^p \right)^{\frac{1}{p}}$ and $\left\| {{\beta _{j,\textit{\textbf{k}},u}}} \right\|_p^p =\sum\limits_{u = 1}^{{2^d} - 1}{\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}} {{{\left|{{\beta _{j,\textit{\textbf{k}},u}}}\right|}^p}}$.
\end{lemma}



\section{Lower bound estimation}\label{S3}
\quad In this section, we will discuss the lower bound estimations of the density function in the mixture model. We first introduce the Kullback-Leibler distance \cite{Tsybakov2008}, which plays a key role in the following discussions. Let $(\Omega,\mathscr{F})$ be a measurable space, and let $U$ and $W$ be two probability measures defined on it. If $U$ is absolutely continuous with respect to $W$ (denoted as $U \ll W$), the corresponding Kullback-Leibler distance is defined as
\begin{align*}
	\mathbb{K}\left(U,W\right):=\int u(x)\ln\frac{u(x)}{w(x)}\mathrm{d}x.
\end{align*}
Here, $u$ and $w$ represent the probability density functions of $U$ and $W$, respectively.

Based on the Kullback-Leibler distance, we can derive the following lemma. In the studies described below, $a\lesssim b$ means that $a\leq cb$ with a positive constant $c$, which is independent of $a$ and $b$. $a\gtrsim b$ means that $b\lesssim a$. In addition, $a\succ b$ means $a\geq cb$, $a\sim b$ means that $a\lesssim b$ and $b\lesssim a$ hold.
\begin{lemma}\label{K0}
	For the estimation model ({\color{blue}\ref{1.1}}), let $g_i(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f_i(\boldsymbol{x})$ $(i=1,2)$ be two density functions defined on $\Omega$, and let $ X_1, X_2, \ldots, X_n $ be a set of i.i.d. random vectors. Then, we have
	\begin{align*}
		\mathbb{K}\left(P_{g_1}^n,P_{g_2}^n\right)\lesssim n\big\|f_1-f_2\big\|_2.
	\end{align*}
\end{lemma}
\begin{proof}
	According to model ({\color{blue}\ref{1.1}}), $g_1(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f_1(\boldsymbol{x})$ and $g_2(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f_2(\boldsymbol{x})$. Due to the fact that $ X_1, X_2, \ldots, X_n $ are independent and identically distributed random vectors, we can easily get the corresponding joint density function:
	\begin{align*}
		g_1^n(\boldsymbol{x})=\prod_{i=1}^ng_1(\boldsymbol{x}_i), g_2^n(\boldsymbol{x})=\prod_{i=1}^ng_2(\boldsymbol{x}_i).
	\end{align*}
	Due to the definition of the Kullback-Leibler distance,
	\begin{equation}
		\begin{split}
		\mathbb{K}\left(P_{g_1}^n,P_{g_2}^n\right)&=\int g_1^n({\boldsymbol{x}})\ln\frac{g_1^n({\boldsymbol{x}})}{g_2^n({\boldsymbol{x}})}\mathrm{d}{\boldsymbol{x}}\\
		&=\int_{\Omega} \prod_{j=1}^ng_1(\boldsymbol{x}_j)\ln\frac{\prod_{i=1}^ng_1(\boldsymbol{x}_i)}{\prod_{i=1}^ng_2(\boldsymbol{x}_i)}\mathrm{d}\boldsymbol{x}\\
		&=\sum_{i=1}^n\int_{\Omega}  g_1(\boldsymbol{x}_i)\ln\frac{g_1(\boldsymbol{x}_i)}{g_2(\boldsymbol{x}_i)}\mathrm{d}\boldsymbol{x}_i\prod_{j\neq i}^n\int_{\Omega}  g_1(\boldsymbol{x}_j)\mathrm{d}\boldsymbol{x}_j	\\
		&=\sum_{i=1}^n\int_{\Omega}  g_1(\boldsymbol{x}_i)\ln\frac{g_1(\boldsymbol{x}_i)}{g_2(\boldsymbol{x}_i)}\mathrm{d}\boldsymbol{x}_i\\
		&=n\int_{\Omega}  g_1(\boldsymbol{x}_1)\ln\frac{g_1(\boldsymbol{x}_1)}{g_2(\boldsymbol{x}_1)}\mathrm{d}\boldsymbol{x}_1\\
		&=n	\mathbb{K}\left(P_{g_1},P_{g_2}\right).
		\end{split}
		\label{j31}
	\end{equation}
	
	Next, we derive the upper bound of $\mathbb{K}\left(P_{g_1},P_{g_2}\right)$. Note that $\ln(Q+1)\leq |Q|$ holds for all $Q > -1$.	 In addition,	
	\begin{align*}
		\frac{g_1(\boldsymbol{x})}{g_2(\boldsymbol{x})}=\frac{\theta h(\boldsymbol{x})+(1-\theta)f_1(\boldsymbol{x})}{\theta h(\boldsymbol{x})+(1-\theta)f_2(\boldsymbol{x})}=1+\frac{(1-\theta) [f_1(\boldsymbol{x})-f_2(\boldsymbol{x})]}{\theta h(\boldsymbol{x})+(1-\theta)f_2(\boldsymbol{x})}:=1+Q(\boldsymbol{x}).
	\end{align*}
	Hence,
	\begin{align*}\label{j32}
		\mathbb{K}\left(P_{g_1},P_{g_2}\right)&=\int _{\Omega} g_1(\boldsymbol{x})\ln\frac{g_1(\boldsymbol{x})}{g_2(\boldsymbol{x})}\mathrm{d}\boldsymbol{x}\notag\\
		&\leq \int_{\Omega} g_1(\boldsymbol{x})|Q(\boldsymbol{x})|\mathrm{d}\boldsymbol{x}\notag\\
		&=(1-\theta)\int_{\Omega} g_1(\boldsymbol{x})\frac{ |f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|}{g_2(\boldsymbol{x})}\mathrm{d}\boldsymbol{x} .
	\end{align*}
	Due to $g_1(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f_1(\boldsymbol{x})$, we can get
	\begin{equation}
		\begin{split}
		\mathbb{K}\left(P_{g_1},P_{g_2}\right) 
		&\leq \theta (1-\theta)\int_{\Omega} \frac{ h(\boldsymbol{x})|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|}{g_2(\boldsymbol{x})}\mathrm{d}\boldsymbol{x}+(1-\theta)^2\int _{\Omega} \frac{ f_1(\boldsymbol{x})|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|}{g_2(\boldsymbol{x})}\mathrm{d}\boldsymbol{x}\\
		&:=A_1+A_2 .
		\end{split}
		\label{j33}
	\end{equation}
	For $A_1$, the nonnegativity of the density function $f_2(\boldsymbol{x})$ shows that $g_2(\boldsymbol{x})\geq \theta h(\boldsymbol{x})$, and hence $\frac{h(\boldsymbol{x})}{g_2(\boldsymbol{x})}\leq \frac{1}{\theta }$. Therefore,
	\begin{equation}\label{A1}
		A_1\leq (1-\theta)\int_{\Omega}|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|\mathrm{d}\boldsymbol{x}.
	\end{equation}
	For $A_2$, it follows from the properties of the density function that $g_2(\boldsymbol{x})\geq(1-\theta) f_2(\boldsymbol{x})$. The boundedness property of $f(\boldsymbol{x})$ implies that $\frac{f_1(\boldsymbol{x})}{f_2(\boldsymbol{x})}\leq C_1$. Furthermore, $\frac{f_1(\boldsymbol{x})}{g_2(\boldsymbol{x})}\leq \frac{f_1(\boldsymbol{x})}{(1-\theta)f_2(\boldsymbol{x})}\leq \frac{C_1}{1-\theta}$ and
	\begin{equation}\label{A2}
		A_2\leq C_1(1-\theta)\int_{\Omega}|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|\mathrm{d}\boldsymbol{x}.
	\end{equation}
	Combining ({\color{blue}\ref{j33}}), ({\color{blue}\ref{A1}}), and ({\color{blue}\ref{A2}}), we know that
	\begin{equation}\label{j36} \mathbb{K}\left(P_{g_1},P_{g_2}\right)\leq(1+C_1)(1-\theta)\int_{\Omega}|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|\mathrm{d}\boldsymbol{x}.
	\end{equation}
	By using the Cauchy-Schwarz inequality, we can get
	\begin{align*}
		\int_{\Omega}|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|\mathrm{d}\boldsymbol{x}\leq \left(\int_{\Omega}|f_1(\boldsymbol{x})-f_2(\boldsymbol{x})|^2\mathrm{d}\boldsymbol{x}\right)^{\frac{1}{2}}\left(\int_{\Omega}1\mathrm{d}\boldsymbol{x}\right)^{\frac{1}{2}}\lesssim \big\|f_1-f_2\big\|_2.
	\end{align*}
	This, together with ({\color{blue}\ref{j36}}) and ({\color{blue}\ref{j31}}), shows that
	\begin{align*}
		\mathbb{K}\left(P_{g_1}^n,P_{g_2}^n\right)\lesssim n\big\|f_1-f_2\big\|_2.
	\end{align*}		
\end{proof}
{\textbf{\hspace{-0.5cm}Fano's lemma}} \cite{Tsybakov2008} \emph{		
	Let $(\Omega,\mathscr{F},P_k)$ $(k=0,1,\ldots,D)$ be probability measure spaces and $G_k\in \mathscr{F}$ a sequence of measurable sets. If $G_k\cap G_l=\varnothing$ holds for $k\neq l$, then
	\begin{equation*}
		\sup_{0\leq k\leq D }P_{k}\left(G_{k}^{c}\right)\geq\min\left\{\sqrt{D}\exp\left(-K_{D}\right), \frac{1}{2}\right\},
	\end{equation*}
	where $G_{k}^{c}$ denotes the complement set of $G_k$ and $	K_{D}:=\inf_{0\leq l\leq D}\frac{1}{D}\sum_{k\neq l}\mathbb{K}\left(P_{k},P_{l}\right)$.}

We are now in a position to state the lower bound estimation of any possible density estimator. Throughout this paper, we assume the density function $f (\boldsymbol{x})\!\in\! B_{\eta,q}^s({{\Omega},T})$ with $B_{\eta,q}^{s}(\Omega,T)\!\!:=\!\! \left\{f \!\in\! B_{\eta,q}^s({{\Omega}}), \|f\|_{B_{\eta,q}^s}\!\leq T \right\}$.
\begin{theorem}\label{MT}
	For the estimation model ({\color{blue}\ref{1.1}}), $f (\boldsymbol{x})\in B_{\eta,q}^s({{\Omega},T})$ with $1\leq \eta,q<+\infty$, $p\in [1,+\infty)$ and $s>\frac{d}{\eta}$. Suppose the region satisfies $\Omega=[0,b]^d$. If $\hat{f}_{n}(\boldsymbol{x})$ is any possible density estimator of the density function $f(\boldsymbol{x})$, we have
	\begin{equation*}
		\sup_{f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)}
		{\rm{E}}\Big[\big\| \hat{f}_{n}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}\Big]\gtrsim \max\left\{n^{-\frac{s}{2s+d}},\left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}\right\}.
	\end{equation*}
\end{theorem}
\begin{remark}
	According to this theorem, note that
	\begin{align*}
		\max\left\{n^{-\frac{s}{2s+d}},\left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}\right\} =
		\begin{cases}
			n^{-\frac{s}{2s+d}},
			& \eta>\frac{pd}{2s+d}, \\
			\left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d},
			& \eta\leq\frac{pd}{2s+d}.
		\end{cases}
	\end{align*}
	This conclusion means that the lower bound estimations have different results under distinct cases.
\end{remark}



\begin{proof}
	In order to prove the above theorem, we divide a theoretical result into two parts as follows:
	\[\sup_{f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)}
	{\rm{E}}\Big[\big\| \hat{f}_{n}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}\Big]\gtrsim n^{-\frac{s}{2s+d}}\]
	and
	\[\sup_{f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)}
	{\rm{E}}\Big[\big\| \hat{f}_{n}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}\Big]\gtrsim \left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}.\]
	
	\textbf{The proof of the first part.}  In order to prove the first part, we need to construct $f_{\lambda^i} (\boldsymbol{x})(i=0,1,\dots,D)$ such that $f_{\lambda^i} (\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$ and 
	\begin{align}\label{j37}
		\sup_{i}
		{\rm{E}}\Big[\big\| \hat{f}_{n} (\boldsymbol{x})-f_{\lambda^i} (\boldsymbol{x})\big\|_{p}\Big]\gtrsim n^{-\frac{s}{2s+d}},
	\end{align}
	with $\hat{f}_{n} (\boldsymbol{x})$ being any possible estimator of the density function $f (\boldsymbol{x})$.
	
	Let the orthogonal scaling function $\Phi$ have compact support and satisfy the $m$-order $(m>s)$ regularity condition; the corresponding wavelet function is denoted by $\Psi_u$ and $\operatorname{supp}\Psi_u \subseteq [0,b]^d=\Omega$. Assume that there exists a compactly supported density function $y(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$ such that $\min\limits_{\boldsymbol{x}\in \Omega}  y(\boldsymbol{x})=c_0>0$ and $\int _{\Omega}y(\boldsymbol{x})\mathrm{d}\boldsymbol{x}=1$.
	
	Now, we define $\delta_j:=2^{-j(s+\frac{d}{2})}$ and $f_{\lambda}(\boldsymbol{x}):=y(\boldsymbol{x})+\delta_j\sum_{k\in\Lambda_{j}}\lambda_k\Psi_{j,k,u}(\boldsymbol{x})$. In those definitions, $\lambda=(\lambda_k)_{k\in\Lambda_{j}}\in \{0,1\}^{2^{jd}}$ and $\Lambda_{j}:=\{ 0, b, 2b, ... , (2^j-1)b \}^d$. Then we can easily get 
	\begin{align*}
		\int _{\Omega}f_{\lambda}(\boldsymbol{x})\mathrm{d}\boldsymbol{x}&=\int _{\Omega}\big[{y(\boldsymbol{x})+\delta_j\sum_{k\in\Lambda_{j}}\lambda_k\Psi_{j,k,u}(\boldsymbol{x})}\big]\mathrm{d}\boldsymbol{x}\\
		&=\int _{\Omega}y(\boldsymbol{x})\mathrm{d}\boldsymbol{x}+\delta_j\sum_{k\in\Lambda_{j}}\lambda_k\int _{\Omega}{\Psi_{j,k,u}(\boldsymbol{x})}\mathrm{d}\boldsymbol{x}\\
		&=1+\delta_j\sum_{k\in\Lambda_{j}}\lambda_k\cdot 0 =1.
	\end{align*}
	Moreover, we can choose sufficiently large $j$ such that 
	\begin{align*}
		f_{\lambda}(\boldsymbol{x})\geq c_0 -\delta_j 2^{\frac{jd}{2}}\big\|\Psi_u  \big\|_{\infty}= c_0 -2^{-js}\big\|\Psi_u  \big\|_{\infty}\geq 0.
	\end{align*}
	Hence, by the above discussions, the functions $f_{\lambda}(\boldsymbol{x})$ are density functions.
	
	On the other hand, due to  $\lambda_k\in \{0,1\}$ and  Lemma \ref{y}, we can obtain
	\begin{align*}
		\sum_{k\in\Lambda_{j}}|\lambda_k|^{\eta}\leq2^{jd}, 2^{j(s+\frac{d}{2}-\frac{d}{\eta})}\delta_j\left(\sum_{k\in\Lambda_{j}}|\lambda_k|^{\eta}\right)^{\frac{1}{\eta}}\leq1.
	\end{align*}
	Hence, $\left\|\delta_j\sum_{k\in\Lambda_{j}}\lambda_k\Psi_{j,k,u}\right\|_{B_{\eta,q}^{s}}\lesssim 1
	$ and  $f_{\lambda}(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$. Then, a series of density functions $f_{\lambda}(\boldsymbol{x})$ has been constructed, which belong to Besov spaces $B_{\eta,q}^{s}(\Omega,T)$. Next, we prove that those density functions $f_{\lambda}(\boldsymbol{x})$ satisfy the result ({\color{blue}\ref{j37}}) with any possible density estimators. The Varshamov-Gilbert Lemma will be used in later discussions. Now, we introduce it as follows.
	
	{\textbf{Varshamov-Gilbert Lemma}}~\cite{Donoho, Tsybakov2008}~~
	For the set $(\lambda_k)_{k\in\Lambda_{j}}\!\!\in\! \{0,1\}^{2^{jd}}\!$, there exists a subset $\{\lambda^{0}\!, \lambda^{1}\!, \ldots  , \lambda^{D}\}$ containing the element $\lambda^{0} = (0, 0, \ldots, 0)$ such that $D \geq e^{2^{jd-3}}$ and 
	\begin{equation*}\label{VG}
		\sum_{k\in\Lambda_{j}}\left|\lambda_k^\omega -\lambda_k^ \mu \right| \geq 2^{jd-3} (0\leq \omega \neq \mu \leq D).
	\end{equation*}
	According to the Varshamov-Gilbert Lemma, when $0\leq \omega \neq \mu \leq D$, we have
	\begin{equation}
		\begin{split}
		\big\|f_{\lambda^\omega}(\boldsymbol{x}) -f_{\lambda^ \mu}(\boldsymbol{x})\big\|_p^{p}&=\left\|\delta_j\sum_{k\in\Lambda_{j}}(\lambda_k^\omega-\lambda_k^\mu)\Psi_{j,k,u}(\boldsymbol{x})\right\|_p^{p}\\
		&=\int_{\Omega}\big|\delta_j\sum_{k\in\Lambda_{j}}(\lambda_k^\omega-\lambda_k^\mu)\Psi_{j,k,u}(\boldsymbol{x})\big|^{p}\mathrm{d}\boldsymbol{x}\\
		&=\delta_j^p\sum_{k\in\Lambda_{j}}|\lambda_k^\omega-\lambda_k^\mu|^p\int_{\Omega}\big|\Psi_{j,k,u}(\boldsymbol{x})\big|^{p}\mathrm{d}\boldsymbol{x}\\
		&=\delta_j^p\sum_{k\in\Lambda_{j}}|\lambda_k^\omega-\lambda_k^\mu|^p\big\|\Psi_{j,k,u}(\boldsymbol{x})\big\|_p^{p}\\ 				&=2^{-j(sp+d)}\big\|\Psi_u\big\|_p^{p}\sum_{k\in\Lambda_{j}}|\lambda_k^\omega-\lambda_k^\mu|^p\geq \frac{1}{8}2^{-jsp}\|\Psi_u\|_p^{p}.
		\end{split}
		\label{P2}
	\end{equation}
	We rewrite the above result as follows:
	\begin{equation*}
		\big\|f_{\lambda^\omega}(\boldsymbol{x}) -f_{\lambda^ \mu}(\boldsymbol{x})\big\|_p \geq 8^{-\frac{1}{p}}2^{-js}\|\Psi_u\|_p:=\sigma_j.
	\end{equation*}
	For an arbitrary density estimator $\hat{f}_{n}(\boldsymbol{x})$, we define $G_{\lambda^\mu}:=\{\|\hat{f}_{n}(\boldsymbol{x})-f_{\lambda^\mu} (\boldsymbol{x})\|_p< \frac{\sigma_j}{2}\}(\mu=0,1,\ldots,D)$. Then, $G_{\lambda^\omega}\cap G_{\lambda^\mu}=\varnothing$ for any $\omega\neq \mu$. Furthermore, according to Fano's lemma, we can obtain
	\begin{equation}\label{BD}
		\sup_{0\leq\mu\leq D }P_{g_{\lambda^\mu}}^n\left(G_{\lambda^\mu}^{c}\right)\geq\min\left\{\sqrt{D}\exp\left(-K_{D}\right), \frac{1}{2}\right\},
	\end{equation}
	where their common distribution is characterized by probability measure $P_{g}^n$ with corresponding joint density function $g^n(\boldsymbol{x}):=\prod_{i=1}^ng(\boldsymbol{x}_i)$.
	Using the arguments of ({\color{blue}\ref{P2}}), 
	\begin{align*}
		\big\|f_{\lambda^\omega}(\boldsymbol{x}) -f_{\lambda^ \mu}(\boldsymbol{x})\big\|_2^{2}=\delta_j^2\sum_{k\in\Lambda_{j}}|\lambda_k^\omega-\lambda_k^\mu|^2\big\|\Psi_{j,k,u}\big\|_2^{2}\leq \delta_j^22^{jd}.
	\end{align*}
	Furthermore, by the definition of $K_{D}$ and Lemma \ref{K0},
	\begin{align*}
		K_{D}&=\inf_{0\leq\mu\leq D}\frac{1}{D}\sum_{\omega\neq \mu}\mathbb{K}\left(P_{g_{\lambda^\omega}}^n,P_{g_{\lambda^\mu}}^n\right)\leq n\delta_j^22^{jd}.
	\end{align*}
	Taking $2^{j}	\sim n^{\frac{1}{2s+d}}$, we can easily get $n\delta_j^2\sim n\cdot n^{\frac{-2(s+\frac{d}{2})}{2s+d}}\sim 1$. By the Varshamov-Gilbert Lemma, there exists a sufficiently small positive constant $c_1$ such that 
	\begin{align*}
		\sqrt{D}e^{-K_{D}}\geq e^{2^{jd-3}}e^{-n\delta_j^22^{jd}}\geq e^{2^{jd-3}}e^{-c_12^{jd}} \geq 1.
	\end{align*}
	According to $({\color{blue}\ref{BD}})$, we have $\sup_{0\leq\mu\leq D }P_{g_{\lambda^\mu}}^n\left(G_{\lambda^\mu}^{c}\right)\gtrsim1$. Hence, by using Markov's inequality, 
	\begin{align*}
		\sup_{0\leq\mu\leq D }
		{\rm{E}}\Big[\big\| \hat{f}_{n}(\boldsymbol{x}) -f_{\lambda^\mu}(\boldsymbol{x}) \big\|_{p}\Big] &\geq  \sup_{0\leq\mu\leq D }\frac{\sigma_j}{2}P_{g_{\lambda^\mu}}^n\left(\big\| \hat{f}_{n}-f_{\lambda^\mu }\big\|_{p}\geq\frac{\sigma_j}{2}\right)\\
		&\gtrsim \sigma_j\sim n^{-\frac{s}{2s+d}},
	\end{align*}
	which is the desired conclusion.
	
	\textbf{The proof of the second part.} For the second part, it is necessary to construct a series of density functions $f_k(\boldsymbol{x})$ such that $f_{k}(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$ and	
	\begin{align*}
		\sup_{k}
		{\rm{E}}\big[\big\| \hat{f}_{n}(\boldsymbol{x})-f_{k}(\boldsymbol{x})\big\|_{p}\big]\gtrsim \left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d},
	\end{align*}
	with $\hat{f}_{n}(\boldsymbol{x})$ being any possible estimator of $f(\boldsymbol{x})$.
	
	Now, we define $\delta_j:=2^{-j(s-\frac{d}{\eta}+\frac{d}{2})}$ and $f_{k}(\boldsymbol{x}):=y(\boldsymbol{x})+\delta_j\Psi_{j,k,u}(\boldsymbol{x})$ with $k\in\Lambda_{j}$. In what follows, we prove that the functions $f_{k}(\boldsymbol{x})$ are density functions and that $f_{k}(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$.
	
	Note that
	\begin{align*}%\label{j311}
		\int _{\Omega}f_{k}(\boldsymbol{x})\mathrm{d}\boldsymbol{x}=\int _{\Omega}\Big[{y(\boldsymbol{x})+\delta_j\Psi_{j,k,u}(\boldsymbol{x})}\Big]\mathrm{d}\boldsymbol{x}=1.
	\end{align*}
	Since $\min_{\boldsymbol{x}\in \Omega}  y(\boldsymbol{x})=c_0>0$ and $\|\delta_j\Psi_{j,k,u}(\boldsymbol{x})\|_{\infty}=2^{-j(s-\frac{d}{\eta})}\|\Psi_u\|_{\infty}$, we can choose $j$ sufficiently large such that
	\begin{align*}%\label{j312}
		f_{k}(\boldsymbol{x})\geq c_0 -2^{-j(s-\frac{d}{\eta})}\big\|\Psi_u  \big\|_{\infty}\geq 0.
	\end{align*}
	Obviously, $\delta_j 2^{j(s+\frac{d}{2}-\frac{d}{\eta})}=1$ and $\left\|\delta_j\Psi_{j,k,u}\right\|_{B_{\eta,q}^{s}}\sim 
	\delta_j 2^{j(s+\frac{d}{2}-\frac{d}{\eta})}\|\Psi_{j,k,u}\|_\eta\sim 1$. This, together with the condition $y(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$, shows that $f_k(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega,T)$. 
	
	For any $k, k'\!\in\!\Lambda_j$, and $k \!\neq\! k'$, we can obtain $\operatorname{supp} \Psi_{j,k,u}\cap \operatorname{supp} \Psi_{j,k',u} \!=\! \varnothing$ and $\operatorname{supp} f_{k}\cap \operatorname{supp} f_{k'}\!=\!\varnothing$. Furthermore, 
	\begin{equation*}
		\big\|f_{k}(\boldsymbol{x}) -f_{k'}(\boldsymbol{x})\big\|_p=\left\|\delta_j\Psi_{j,k,u}-\delta_j\Psi_{j,k',u}\right\|_p= 2^{-j(s+\frac{d}{p}-\frac{d}{\eta})}\|\Psi_u\|_p:=\sigma_j.
	\end{equation*}
	In addition, take $G_{k}:=\{\|\hat{f}_{n}(\boldsymbol{x}) -f_{k}(\boldsymbol{x})  \|_p< \frac{\sigma_j}{2}\}$. Then,  $G_{k}\cap G_{k'}=\varnothing$ holds when $k \neq k'$.
	Using Fano's lemma, one has
	\begin{equation}\label{BD1}	
		\sup_{k\in\Lambda_j }P_{g_{k}}^n\left(G_{k}^{c}\right)\geq\min\left\{\sqrt{2^{jd}}\exp\left(-K_{2^{jd}}\right), \frac{1}{2}\right\},
	\end{equation}
	where their common distribution is characterized by probability measure $P_{g_k}^n$ with corresponding joint density function $g^n(\boldsymbol{x}):=\prod_{i=1}^ng(\boldsymbol{x}_i)$.
	
	By Markov's inequality, we get
	\begin{equation}\label{BD2}
		{\rm{E}}\big[\big\| \hat{f}_{n}(\boldsymbol{x}) -f_{k}(\boldsymbol{x}) \big\|_{p}\big] \geq  \frac{\sigma_j}{2}P_{g_k}^n\left(\big\| \hat{f}_{n}-f_{k}\big\|_{p}\geq\frac{\sigma_j}{2}\right)=\frac{\sigma_j}{2}P_{g_k}^n(G_{k}^{c}).
	\end{equation}
	Combining $({\color{blue}\ref{BD1}})$ and $({\color{blue}\ref{BD2}})$, 
	\begin{equation}\label{BD3}
		\sup_{k\in\Lambda_j }	{\rm{E}}\big[\big\| \hat{f}_{n}(\boldsymbol{x})-f_{k}(\boldsymbol{x})\big\|_{p}\big]\geq\frac{\sigma_j}{2}\min\left\{\sqrt{2^{jd}}\exp\left(-K_{2^{jd}}\right), \frac{1}{2}\right\}.
	\end{equation}
	Using Lemma \ref{K0} and the definition of $K_{2^{jd}}$,
	\begin{align*}
		K_{2^{jd}}=\inf_{k'\in \Lambda_{j}}2^{-jd}\sum_{k\neq k'}\mathbb{K}\left(P_{g_{k}}^n,P_{g_{k'}}^n\right)\leq n\delta_j^2.
	\end{align*}
Taking $2^{j}	\sim \left(\frac{ n}{\ln n}\right)^{\frac{1}{2(s-\frac{d}{\eta})+d}}$, we have $n\delta_j^2= n\cdot 2^{-2j(s-\frac{d}{\eta}+\frac{d}{2})}\sim 
	n\left(\frac{n}{\ln n}\right)^{-\frac{2(s-\frac{d}{\eta}+\frac{d}{2})}{2(s-\frac{d}{\eta})+d}}=\ln n$. In addition, there exists a small enough positive constant $c_4$ such that $n\delta_j^2\leq c_4\ln n $ and $\left[4(s-\frac{d}{\eta})+2d\right]^{-1}>c_4$. Hence,
	\begin{align*}
		\sqrt{2^{jd}}e^{-K_{2^{jd}}}\geq \sqrt{2^{jd}}e^{-c_4\ln n} \gtrsim \left(\frac{n}{\ln n}\right)^{\left[4(s-\frac{d}{\eta})+2d\right]^{-1}}n^{-c_4} \geq 1.
	\end{align*}
	This, together with $({\color{blue}\ref{BD3}})$, shows that
	\begin{align*}
		\sup_{k\in\Lambda_j }	{\rm{E}}\Big[\big\| \hat{f}_{n}(\boldsymbol{x}) -f_{k}(\boldsymbol{x}) \big\|_{p}\Big]\gtrsim{\sigma_j}\gtrsim 2^{-j(s-\frac{d}{\eta}+\frac{d}{p})}\sim \left(\frac{\ln n}{n}\right)^\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d},
	\end{align*}
	which is the desired conclusion.
\end{proof}	

\section{Upper bound estimations}\label{S4}
\quad In this section, we will construct two density estimators by using the wavelet method. The upper bounds over $L^{p}(1\leq p<+\infty)$ risk of those two wavelet estimators are considered with some mild conditions.
\subsection{Linear estimator}
\quad In this subsection, we develop a linear wavelet estimator using wavelet theory, and discuss its rate of convergence under \(L^p\) risk (\(1 \leq p < \infty\)) in Besov spaces.

For the estimation model ({\color{blue}\ref{1.1}}), a linear wavelet estimator is constructed as follows:
\begin{equation*}\label{2.2}
	\hat{f}_{n}^{\mathrm{lin}}({\boldsymbol{x}}):=\sum_{\textit{\textbf{k}}\in\Lambda_{j_{*}}}\hat{\alpha}_{j_{*},\textit{\textbf{k}}}\Phi_{j_{*},\textit{\textbf{k}}}(\boldsymbol{x}), \boldsymbol{x}\in \Omega.
\end{equation*}
Here, the coefficient estimator $\hat{\alpha}_{j_*,\textit{\textbf{k}}}$ is defined as follows; by Lemma \ref{a1}, it is an unbiased estimator of $\alpha_{j_*,\textit{\textbf{k}}}$:
\begin{equation}\label{2.4}
	\hat{\alpha}_{j_*,\textit{\textbf{k}}}:=\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-\mu_{j_*,\textit{\textbf{k}}},
\end{equation}
with
$\mu_{j_*,\textit{\textbf{k}}}=\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}$. 
Now, in order to prove important theorems for the linear estimator, we firstly give some lemmas.
\begin{lemma}\label{a1}
	Assuming that $\hat{\alpha}_{j_*,\textit{\textbf{k}}}$ is defined as ({\color{blue}\ref{2.4}}), then 
	\begin{gather*}
		{\rm{E}}[\hat{\alpha}_{j_*,\textit{\textbf{k}}}]=\alpha_{j_*,\textit{\textbf{k}}}.
	\end{gather*} 
\end{lemma}
\begin{proof}
	According to 	$\hat{\alpha}_{j_*,\textit{\textbf{k}}}=\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-\mu_{j_*,\textit{\textbf{k}}}$ and $\mu_{j_*,\textit{\textbf{k}}}=\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}
	$, we can easily get
	\begin{equation}
		\begin{split}
		{\rm{E}}[\hat{\alpha}_{j_*,\textit{\textbf{k}}}]&={\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}\Phi_{j_*,\textit{\textbf{k}}}(X_{i})-\mu_{j_*,\textit{\textbf{k}}}\Bigg]\\
		&= {\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}\Phi_{j_*,\textit{\textbf{k}}}(X_{i})-\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}\Bigg]\\
		&= {\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}\Phi_{j_*,\textit{\textbf{k}}}(X_{i})\Bigg]-\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}.
		\end{split}
		\label{j42}
	\end{equation}
	Since the random variables $X_{1},X_{2},\cdots, X_{n}$ are i.i.d.\ with common density function $g(\boldsymbol{x})$, 
	\begin{align}\label{j43}
		{\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}\Phi_{j_*,\textit{\textbf{k}}}(X_{i})\Bigg]={\rm{E}}\Bigg[\frac{1}{1-\theta}\Phi_{j_*,\textit{\textbf{k}}}(X_{1})\Bigg]=\int_{\Omega}\frac{1}{1-\theta}g(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}.
	\end{align}
	By ({\color{blue}\ref{j42}}),  ({\color{blue}\ref{j43}}), and the model in equation ({\color{blue}{\ref{1.1}}}), which has $g(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})$, we know that
	\begin{align*}
		{\rm{E}}[\hat{\alpha}_{j_*,\textit{\textbf{k}}}]
		&=\int_{\Omega}\frac{1}{1-\theta}g(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}-\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}\\
		&=\int_{\Omega}\frac{1}{1-\theta}\big[\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})\big]\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}-\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}\\
		&=\int_{\Omega}f(\boldsymbol{x})\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})d\boldsymbol{x}=\alpha_{j_*,\textit{\textbf{k}}}.
	\end{align*}
	The proof of Lemma \ref{a1} is finished.
\end{proof}		
%		To prove the second equation, substitute $\hat{\beta}_{j,\textit{\textbf{k}},u}$ for $\hat{\alpha}_{j_*,\textit{\textbf{k}}}$ and $\Psi_{j,\textit{\textbf{k}},u}$ for $\Phi_{j_*,\textit{\textbf{k}}}$. Using the same method of proof, the conclusion can be reached. The proof of \textbf{Lemma 4.1} is complete.		
{\textbf{\hspace{-0.5cm}Rosenthal's inequality}} \cite{Geng}
 Let $ X_{1}, \ldots, X_{n} $ be independent random variables with $ {\rm{E}}[X_{i}] =0 $ and $ {\rm{E}}[|X_{i}|^{p}] <\infty $. Then
\begin{align*}
	{\rm{E}}\left[{{{\left|\sum\limits_{i = 1}^n X_{i}\right|}^{p}}}\right]
	\lesssim
	\begin{cases}
		\sum\limits_{i = 1}^n {\rm{E}}\left[{{{\left| X_{i}\right|}^{p}}}\right]+\left( \sum\limits_{i = 1}^n {\rm{E}}\left[{{{\left| X_{i}\right|}^{2}}}\right]\right) ^{\frac{p}{2}}, &\text{ $p >2$}, \\
		\left(\sum\limits_{i = 1}^n {\rm{E}}\left[{{{\left| X_{i}\right|}^{2}}}\right]\right)^{\frac{p}{2}}, & \text{ $ 1\leq p\leq 2$}.
	\end{cases}
\end{align*}	
	\begin{lemma}\label{a2}	
	Let $2^{j_*d}\leq n$, $0<\theta\leq c_3<1$, and $\hat{\alpha}_{j_*,\textit{\textbf{k}}}$ be defined as in ({\color{blue}\ref{2.4}}). Then, for any $1\leq p<\infty$, we can derive 
	\begin{gather*}
		{\rm{E}}\big[|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|^{p}\big]\lesssim n^{-\frac{p}{2}}.
	\end{gather*} 
\end{lemma}
\begin{proof}
	By the definition of $\hat{\alpha}_{j_*,\textit{\textbf{k}}}$ and Lemma \ref{a1}, we obtain 
	\begin{align*}
		|\hat\alpha_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|
		&=\Bigg|\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-\mu_{j_*,\textit{\textbf{k}}}-{\rm{E}}\left[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-\mu_{j_*,\textit{\textbf{k}}}\right]\Bigg|\\
		&=\Bigg|\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\Bigg]\Bigg|\\
		&=\frac{1}{n}\Bigg|\sum_{i=1}^{n}\frac{1}{1-\theta}\big({\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}\big[{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big]\big)\Bigg|\\
		&=\frac{1}{n}\bigg|\sum_{i=1}^{n}D_{i}\bigg|,
	\end{align*}
	where $D_{i}:=\frac{1}{1-\theta}\left(\Phi_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}[\Phi_{j_*,\textit{\textbf{k}}}(X_{i})]\right)$. Obviously, ${\rm{E}}[D_{i}]=0$ and
	\begin{align*}%\label{j44}
		{\rm{E}}\big[|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|^{p}\big]
		&= {\rm{E}}\Bigg[\Bigg|\frac{1}{n}\Bigg|\sum_{i=1}^{n}D_{i}\Bigg|\Bigg|^{p}\Bigg]=\frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}D_{i}\right|^{p}\right].
	\end{align*}
	Using the assumption that $0<\theta\leq c_3<1$, we can easily get
	\begin{equation}
		\begin{split}
		{\rm{E}}[|D_{i}|^p]&={\rm{E}}\left[\left|\frac{1}{1-\theta}\big({\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}\big[{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big]\big)\right|^p\right]\\
		&=\frac{1}{(1-\theta)^p}{\rm{E}}\big[\big|{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}\big[{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big]\big|^p\big]\\
		&\lesssim {\rm{E}}\left[\big|{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})-{\rm{E}}\big[{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big]\big|^p\right]\\
		&\lesssim {\rm{E}}\left[\big|{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big|^p\right].
		\end{split}
		\label{j45}
	\end{equation}
	Since $g(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})$ and the density functions $h(\boldsymbol{x})$ and $f(\boldsymbol{x})$ are bounded on $\Omega$, it is easy to see that
	\begin{align*}%\label{j46}
		{\rm{E}}\left[\big|{\Phi}_{j_*,\textit{\textbf{k}}}(X_{i})\big|^p\right]&=\int_{\Omega}g(\boldsymbol{x})\left|{\Phi}_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right|^pd\boldsymbol{x}\notag\\
		&=\int_{\Omega}\theta h(\boldsymbol{x})\left|{\Phi}_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right|^pd\boldsymbol{x}+\int_{\Omega}(1-\theta)f(\boldsymbol{x})\left|{\Phi}_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right|^pd\boldsymbol{x}\notag\\
		&\lesssim \int_{\Omega}\left|{\Phi}_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right|^pd\boldsymbol{x}. 
	\end{align*}
	This, together with Lemma \ref{y2.2} and ({\color{blue}\ref{j45}}), shows that
	\begin{align*}
		{\rm{E}}[|D_{i}|^p]&\lesssim \int_{\Omega} \left|{\Phi}_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right|^pd\boldsymbol{x}
		\lesssim 2^{j_*(\frac{d}{2}-\frac{d}{p})p}.
	\end{align*}
	In particular, when $p=2$, ${\rm{E}}[|D_{i}|^2]\lesssim 1$.
	
	By Rosenthal's inequality and $2^{j_*d}\leq n$, when $p>2$, we have
	\begin{align*}
		{\rm{E}}\big[|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|^{p}\big]
		&= \frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}D_{i}\right|^{p}\right]\\
		&\lesssim \frac{1}{n^{p}}\left\{\sum_{i=1}^{n}{\rm{E}}\left[\left|D_{i}\right|^{p}\right]  +\left(\sum_{i=1}^{n}{\rm{E}}\left[D_{i}^{2}\right] \right)^{\frac{p}{2}} \right\}\\
		&\lesssim n^{-p}\left(n2^{j_*(\frac{d}{2}-\frac{d}{p})p}+n^{\frac{p}{2}}\right)\\
		&\lesssim n^{-\frac{p}{2}}.
	\end{align*}
	When $1\leq p\leq 2$, we get 
	\begin{align*}
		{\rm{E}}\big[|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|^{p}\big]
		&= \frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}D_{i}\right|^{p}\right]\\
		&\lesssim \frac{1}{n^{p}}\left(\sum_{i=1}^{n}{\rm{E}}\left[D_{i}^{2}\right] \right)^{\frac{p}{2}}\\
		&\lesssim n^{-\frac{p}{2}}.
	\end{align*}
	In conclusion, when $1\leq p<\infty$, the following result holds:
	\begin{align*}
		{\rm{E}}\big[|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}|^{p}\big]\lesssim n^{-\frac{p}{2}}.
	\end{align*}
\end{proof}	

We are now in a position to state the convergence rate of the linear wavelet estimator in the following theorem. In the sequel, we use the notation $x_+ := \max\{x, 0\}$.
\begin{theorem}\label{Theoren1}
	Consider the model ({\color{blue}{\rm{\ref{1.1}}}}) with $f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega)$, where $\eta,q\in[1,\infty)$, $s>0$, and $1\leq p<\infty$. If either $1\leq p<\eta$, or $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$, then the linear estimator $\hat{f}_{n}^{\rm lin}(\boldsymbol{x})$ with $2^{j_{*}}\sim n^{\frac{1}{2s^{\prime}+d}}$ and $s^{\prime}=s-d(\frac{1}{\eta}-\frac{1}{p})_{+}$ satisfies
	\begin{align*}
		{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}^{p}\big]\lesssim n^{-\frac{s^{\prime}p}{2s^{\prime}+d}}.
	\end{align*}
\end{theorem}
\begin{remark}
	It is easy to see from Theorem \ref{Theoren1} that the convergence rates of this linear wavelet estimator are different under distinct cases. According to the lower bound estimations in Theorem \ref{MT}, this linear wavelet estimator can attain the optimal convergence rate $n^{-\frac{sp}{2s+d}}$ in the case of $1\leq p<\eta$. On the other hand, when $p=2$, our results reduce to the conclusion of Liang and Kou \cite{Liang}.
\end{remark}
\begin{remark}
	Note that the convergence rate of this linear wavelet estimator is $n^{-\frac{(s-d/\eta+d/p)p}{2(s-d/\eta+d/p)+d}}$ when $1\leq \eta\leq p$. Compared with the lower bounds in Theorem \ref{MT}, the convergence rate of the linear wavelet estimator is worse in the case $1\leq \eta\leq p$. Moreover, the linear wavelet estimator relies on the smoothness parameter $s$ of the unknown density function $f(\boldsymbol{x})$. Hence, this linear estimator is not adaptive. In order to overcome these shortcomings, a nonlinear wavelet estimator based on the hard thresholding method will be proposed in the following subsection.
\end{remark}
\begin{proof}[Proof of Theorem~\ref{Theoren1}]
%{\textbf{\hspace{-0.5cm}Proof of Theorem \ref{Theoren1}:}}
Based on the definitions of $\hat{f}_{n}^{\rm lin}(\boldsymbol{x})$ and orthogonal projection operators, we obtain
\begin{equation}
	\begin{split}
	{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}^{p}\big] &= {\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-{\rm P}_{V_{j_*}}f(\boldsymbol{x})+{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}^{p}\big]\\
	&\lesssim {\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-{\rm P}_{V_{j_*}}f(\boldsymbol{x})\big\|_{p}^{p}\big]+\big\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}^{p}.
	\end{split}
	\label{4.3}
\end{equation}
By Lemma \ref{y2.2}, it is easy to obtain		
\begin{align*}
	{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-{\rm P}_{V_{j_*}}f(\boldsymbol{x})\big\|_{p}^{p}\big]&=
	{\rm{E}}\left[\left\|\sum_{\textit{\textbf{k}}\in \Lambda_{j_{*}}}\left(\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}\right)\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right\|_{p}^{p}\right]\\
	&\lesssim 2^{j_{*}(\frac{d}{2}-\frac{d}{p})p}\sum_{\textit{\textbf{k}}\in \Lambda_{j_*}}{\rm{E}}\left[\left|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}\right|^p\right].
\end{align*}
Furthermore, using Lemma \ref{a2}, $\left|\Lambda_{j_*}\right|\sim 2^{j_{*}d}$, and $2^{j_{*}}\sim n^{\frac{1}{2s^{\prime}+d}}$,
\begin{align}\label{4.4}
	{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-{\rm P}_{V_{j_*}}f(\boldsymbol{x})\big\|_{p}^{p}\big]
	\lesssim  2^{j_{*}(\frac{d}{2}-\frac{d}{p})p}\sum_{\textit{\textbf{k}}\in \Lambda_{j_*}}n^{-\frac{p}{2}}
	\lesssim  2^{j_{*}(\frac{d}{2}-\frac{d}{p})p}\cdotp2^{j_{*}d}\cdotp n^{-\frac{p}{2}}
	\sim n^{-\frac{s^{\prime}p}{2s^{\prime}+d}}.
\end{align}
When $1\leq p<\eta$, $s^{\prime}=s-d(\frac{1}{\eta}-\frac{1}{p})_{+}=s$. By the H\"{o}lder inequality and $f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega)$, we have
\begin{align*}
	\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{p}^{p}&=\int_{\Omega}|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})|^{p}\cdot1d\boldsymbol{x}\\
	&\leq \left(\int_{\Omega}|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})|^{p\cdot\frac{{\eta}}{p}}d\boldsymbol{x}\right)^\frac{{p}}{\eta}\left(\int_{\Omega} 1^{\frac{\eta}{\eta - p}}d\boldsymbol{x}\right)^{1 - \frac{p}{\eta}}\\
	&\lesssim\left(\int_{\Omega}|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})|^{\eta}d\boldsymbol{x}\right)^\frac{p}{\eta}\\
	&=\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{\eta}^{p}.
\end{align*}
Due to Lemma \ref{y}, the definition of  ${\rm P}_{V_{j_*}}f(\boldsymbol{x})$, and $f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega)$, it is easy to see that
\begin{equation}
	\begin{split}
	\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{\eta}&=\left\|\sum_{j=j_{*}}^{\infty}\sum_{u=1}^{2^d-1}\sum_{\textit{\textbf{k}}\in \Lambda_{j}}\beta_{j,\textit{\textbf{k}},u}{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right\|_{\eta}\\
	&\leq\sum_{j=j_{*}}^{\infty}\Big\|{\rm P}_{V_{j+1}}f(\boldsymbol{x})-{\rm P}_{V_{j}}f(\boldsymbol{x})\Big\|_{\eta}\\
	&\lesssim \sum_{j=j_{*}}^{\infty}2^{-js}.
	\end{split}
	\label{PP}
\end{equation}
Therefore, according to ({\color{blue}\ref{PP}}), we have	
\begin{align}\label{p1}
	\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{p}^{p}\lesssim \Bigg(\sum_{j=j_{*}}^{\infty}2^{-js}\Bigg)^{p}\lesssim2^{-j_{*}sp}\sim  n^{-\frac{sp}{2s+d}}.
\end{align}	
When $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$, $s^{\prime}=s-d(\frac{1}{\eta}-\frac{1}{p})_{+}=s-\frac{d}{\eta}+\frac{d}{p}$. 
The Besov space embedding theorem gives $B_{\eta,q}^{s}(\Omega)\subseteq B_{p,q}^{s-\frac{d}{\eta}+\frac{d}{p}}(\Omega)$ and, arguing as in ({\color{blue}\ref{PP}}) with $\eta$ and $s$ replaced by $p$ and $s^{\prime}$,
\begin{align}\label{p2}
	\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{p}^{p}\lesssim \Bigg(\sum_{j=j_{*}}^{\infty}2^{-js^{\prime}}\Bigg)^{p}\lesssim2^{-j_{*}s^{\prime}p}\sim n^{-\frac{s^{\prime}p}{2s^{\prime}+d}}.
\end{align}
Hence, according to ({\color{blue}\ref{p1}}) and  ({\color{blue}\ref{p2}}), for each $1\leq p<\infty$,
\begin{align}\label{p3}
	\|{\rm P}_{V_{j_*}}f(\boldsymbol{x})-f(\boldsymbol{x})
	\|_{p}^{p}\lesssim n^{-\frac{s^{\prime}p}{2s^{\prime}+d}}.
\end{align} 

Finally, owing to ({\color{blue}\ref{4.3}}), ({\color{blue}\ref{4.4}}), ({\color{blue}\ref{p3}}), we prove that
\begin{align*}
	{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-f(\boldsymbol{x})\big\|_{p}^{p}\big]\lesssim n^{-\frac{s^{\prime}p}{2s^{\prime}+d}}.
\end{align*} 
\end{proof}

\subsection{Nonlinear wavelet estimator}
\quad This subsection constructs a nonlinear wavelet estimator based on the thresholding function, and discusses the convergence rate under \(L^p\) (\(1 \leq p < \infty\)) risk in Besov spaces. First, we define the nonlinear wavelet estimator by
\begin{equation*}\label{2.3}
	\hat{f}_{n}^{\mathrm{non}}(\boldsymbol{x}) := \sum\limits_{\textit{\textbf{k}} \in {\Lambda _{{j_*}}}} {{\hat{\alpha} _{{j_*},\textit{\textbf{k}}}}} {\Phi _{{j_*},\textit{\textbf{k}}}}(\boldsymbol{x}) + \sum\limits_{j = {j_*}}^{{{j}_2}} {\sum\limits_{u = 1}^{{2^d} - 1} {\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\hat\beta _{j,\textit{\textbf{k}},u}}}} {\mathbb{I}_{\{ |{{\hat \beta }_{j,\textit{\textbf{k}},u}}| \ge \kappa {t_n}\} }}{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x}), \quad \boldsymbol{x}\in \Omega,
\end{equation*}
with ${t_n}:= \sqrt{\frac{\ln n}{n}}$.
In addition, we define the following estimator of the coefficients $\beta_{j,\textit{\textbf{k}},u}$:
\begin{equation}\label{2.5}
	\hat{\beta}_{j,\textit{\textbf{k}},u}:=\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-\gamma_{j,\textit{\textbf{k}},u},
\end{equation}
where  $\gamma_{j,\textit{\textbf{k}},u}=\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}$. By Lemma \ref{b1}, $\hat{\beta}_{j,\textit{\textbf{k}},u}$ is an unbiased estimator of ${\beta}_{j,\textit{\textbf{k}},u}$; moreover, Lemma \ref{b2} shows that it converges to ${\beta}_{j,\textit{\textbf{k}},u}$ in the $p$-th mean. 
To facilitate the proof of Theorem \ref{Theoren2}, we first introduce several lemmas.	
\begin{lemma}\label{b1}
	Assuming that $\hat{\beta}_{j,\textit{\textbf{k}},u}$ is defined as in ({\color{blue}\ref{2.5}}), we obtain
	\begin{gather*}
		{\rm{E}}[\hat{\beta}_{j,\textit{\textbf{k}},u}]=\beta_{j,\textit{\textbf{k}},u}.
	\end{gather*} 
\end{lemma}
\begin{proof}
	Since  $\hat{\beta}_{j,\textit{\textbf{k}},u}=\frac{1}{n}\sum\limits_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-\gamma_{j,\textit{\textbf{k}},u}$ and $\gamma_{j,\textit{\textbf{k}},u}=\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}$, by the properties of expectation, one has
	\begin{align*}
		{\rm{E}}[\hat{\beta}_{j,\textit{\textbf{k}},u}]&={\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-\gamma_{j,\textit{\textbf{k}},u}\Bigg]	\\
		&= {\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})\Bigg]-\gamma_{j,\textit{\textbf{k}},u}.
	\end{align*}  
	Since it is known that $X_{1},X_{2},\cdots, X_{n}$ are i.i.d. random variables with a common density function $g(\boldsymbol{x})$, we can obtain
	\begin{align*}
		{\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})\Bigg]={\rm{E}}\Bigg[\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{1})\Bigg]=\int_{\Omega}\frac{1}{1-\theta}g(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}.
	\end{align*}
	Combining the above arguments and the mixture distribution $g(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})$, we can have
	\begin{align*}
		{\rm{E}}[\hat{\beta}_{j,\textit{\textbf{k}},u}]&=\int_{\Omega}\frac{1}{1-\theta}\big[\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})\big]\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}-\int_{\Omega}\frac{\theta}{1-\theta}h(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}\\
		&=\int_{\Omega}f(\boldsymbol{x})\Psi_{j,\textit{\textbf{k}},u}(\boldsymbol{x})d\boldsymbol{x}=\beta_{j,\textit{\textbf{k}},u}.
	\end{align*}
	Lemma \ref{b1} is proved.
\end{proof}	
\begin{lemma}\label{b2}	
	Let $2^{jd}\leq n$, $0<\theta\leq c_3<1$, and $\hat{\beta}_{j,\textit{\textbf{k}},u}$ be defined as in ({\color{blue}\ref{2.5}}). Then, for any $1\leq p<\infty$, we can derive 
	\begin{gather*}
		{\rm{E}}\big[|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^{p}\big]\lesssim n^{-\frac{p}{2}}.
	\end{gather*} 
\end{lemma}	
\begin{proof}
	By Lemma \ref{b1}, we have ${\rm{E}}[\hat{\beta}_{j,\textit{\textbf{k}},u}]=\beta_{j,\textit{\textbf{k}},u}$. Therefore,
	\begin{equation} 
		\begin{split}
		|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|
		&=\Bigg|\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-\gamma_{j,\textit{\textbf{k}},u}-{\rm{E}}\Bigg[\frac{1}{n}\sum_{i=1}^{n}\frac{1}{1-\theta}{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-\gamma_{j,\textit{\textbf{k}},u}\Bigg]\Bigg|\\
		&=\frac{1}{n}\Bigg|\sum_{i=1}^{n}\frac{1}{1-\theta}\big({\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})]\big)\Bigg|\\
		&=\frac{1}{n}\bigg|\sum_{i=1}^{n}G_{i}\bigg|,
		\end{split}
		\label{G_i}
	\end{equation}
	with $G_{i}:=\frac{1}{1-\theta}\big(\Psi_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[\Psi_{j,\textit{\textbf{k}},u}(X_{i})]\big)$. 	Obviously, ${\rm{E}}[G_{i}]=0$ and 
	\begin{align*}
		{\rm{E}}\big[|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^{p}\big]
		= {\rm{E}}\Bigg[\Bigg|\frac{1}{n}\Bigg|\sum_{i=1}^{n}G_{i}\Bigg|\Bigg|^{p}\Bigg]
		= \frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}G_{i}\right|^{p}\right].
	\end{align*}
	From the condition $0<\theta\leq c_3<1$, it is known that
	\begin{align*}
		{\rm{E}}[|G_{i}|^p]&={\rm{E}}\left[\left|\frac{1}{1-\theta}\big({\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})]\big)\right|^p\right]\\
		&=\frac{1}{(1-\theta)^p}{\rm{E}}\left[\big|{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})]\big|^p\right]\\
		&\lesssim {\rm{E}}\left[\big|{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})]\big|^p\right]\\
		&\lesssim {\rm{E}}\left[\big|{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})\big|^p\right].
	\end{align*}		
	The mixture model $g(\boldsymbol{x})=\theta h(\boldsymbol{x})+(1-\theta)f(\boldsymbol{x})$ in ({\color{blue}\ref{1.1}}), combined with the boundedness of $h(\boldsymbol{x})$ and $f(\boldsymbol{x})$ on $\Omega$, yields
	\begin{align*}
		{\rm{E}}\left[\big|{\Psi}_{j,\textit{\textbf{k}},u}(X_{i})\big|^p\right]
		&=\int_{\Omega}g(\boldsymbol{x})\left|{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right|^pd\boldsymbol{x}\\
		&=\int_{\Omega}\theta h(\boldsymbol{x})\left|{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right|^pd\boldsymbol{x}+\int_{\Omega}(1-\theta)f(\boldsymbol{x})\left|{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right|^pd\boldsymbol{x}\\
		&\lesssim \int_{\Omega}\left|{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right|^pd\boldsymbol{x}.		
	\end{align*}	
	Consequently, arguing as in the proof of Lemma \ref{a2}, we obtain
	\begin{align*}
		{\rm{E}}[|G_{i}|^p]\lesssim \int_{\Omega}\left|{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\right|^pd\boldsymbol{x} \lesssim 2^{j(\frac{d}{2}-\frac{d}{p})p}.
	\end{align*}
	In particular, when $p=2$, ${\rm{E}}[|G_{i}|^2]\lesssim 1$.
	
	Applying Rosenthal's inequality and given that $2^{jd}\leq n$, for $p>2$, we have
	\begin{align*}
		{\rm{E}}[|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^{p}]
		&= \frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}G_{i}\right|^{p}\right]\\
		&\lesssim \frac{1}{n^{p}}\left\{\sum_{i=1}^{n}{\rm{E}}\left[\left|G_{i}\right|^{p}\right]  +\left(\sum_{i=1}^{n}{\rm{E}}\left[G_{i}^{2}\right] \right)^{\frac{p}{2}} \right\}\\
		&\lesssim n^{-p}\left(n2^{j(\frac{d}{2}-\frac{d}{p})p}+n^{\frac{p}{2}}\right)\\
		&\lesssim n^{-\frac{p}{2}}.
	\end{align*}
	When $1\leq p\leq 2$, we get 
	\begin{align*}
		{\rm{E}}[|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^{p}]
		&= \frac{1}{n^{p}}{\rm{E}}\left[\left|\sum_{i=1}^{n}G_{i}\right|^{p}\right]\\
		&\lesssim \frac{1}{n^{p}}\left(\sum_{i=1}^{n}{\rm{E}}\left[G_{i}^{2}\right] \right)^{\frac{p}{2}}\\
		&\lesssim n^{-\frac{p}{2}}.
	\end{align*}
	Hence, we can obtain
	\begin{align*}
		{\rm{E}}[|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^{p}]\lesssim n^{-\frac{p}{2}}.
	\end{align*}
\end{proof}	
{\textbf{\hspace{-0.5cm}Bernstein's inequality}} \cite{Härdle}
Let $ X_{1}, \ldots, X_{n} $ be independent random variables with $ {\rm{E}}[X_{i}] =0 $, $ |{{X_i}}|< c $, and $ \sigma^{2}=\frac{1}{n}\sum\limits_{i = 1}^n \mathbf{var}(X_i) $. Then, for each $ \epsilon >0 $,
\begin{align*}
	{\rm Pr}\left({\frac{1}{n}\left|{\sum\limits_{i = 1}^n {{X_i}}}\right| \ge \epsilon }\right)
	\le 2 \exp\left\{{ - \frac{n \epsilon^{2}}{{2({\sigma^{2} + c\epsilon /{3}})}}}\right\}.
\end{align*}
%		\begin{lemma}
	%		(Bernstein's inequality, {\color{blue}Härdle et al., 1998})
	%			~Let $ X_{1}, \ldots, X_{n} $ be independent random variables. We assume that $ {\rm{E}}[X_{i}] =0 $, $ |{{X_i}}|< c $, and $  \sigma^{2}=\frac{1}{n}{\sum\limits_{i = 1}^n \mathbf{var}({X_i}}) $. Under these conditions, for each $ \epsilon >0 $,
	%			\begin{align*}
		%				{\rm Pr}\left({\frac{1}{n}\left|{\sum\limits_{i = 1}^n {{X_i}}}\right| \ge \epsilon }\right)
		%				\le 2 \exp\left\{{ - \frac{n \epsilon^{2}}{{2({\sigma^{2} + c\epsilon /{3}})}}}\right\}.
		%			\end{align*}
	%		\end{lemma}
\begin{lemma}\label{e1}	
	Let $2^{jd}\lesssim \frac{n}{\ln n}$,  $0<\theta\leq c_3<1$, and $p\in [1, \infty)$. The estimator $\hat{\beta}_{j,\textit{\textbf{k}},u}$ is given by ({\color{blue}\ref{2.5}}). For some constant $\kappa>1$, we have
	\begin{gather*}
		{\rm Pr}\big (|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|\geq \kappa t_{n}\big )\lesssim n^{-2p}.
	\end{gather*}
\end{lemma}	
\begin{proof}
	According to ({\color{blue}\ref{G_i}}), note that $\big|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\big|=\frac{1}{n}\big|\sum_{i=1}^{n}G_{i}\big|$ and
	\begin{align}\label{Pr1}
		\bigg\{\left|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\geq\kappa t_{n}\bigg\}\subseteq\bigg\{\frac{1}{n}\bigg|\sum_{i=1}^{n}G_{i}\bigg|\geq\kappa t_{n}\bigg\}.
	\end{align}
	It is known that $G_{i}:=\frac{1}{1-\theta}\big(\Psi_{j,\textit{\textbf{k}},u}(X_{i})-{\rm{E}}[\Psi_{j,\textit{\textbf{k}},u}(X_{i})]\big)$, which gives ${\rm{E}}[G_{i}]=0$.  		
	Because $h(\boldsymbol{x})$, $f(\boldsymbol{x})$, and $\Phi(\boldsymbol{x})$ are all bounded functions, 
	\begin{align*}
		|G_{i}|
		\lesssim&\, \big|\Psi_{j,\textit{\textbf{k}},u}(X_{i})\big|+\big|{\rm{E}}[\Psi_{j,\textit{\textbf{k}},u}(X_{i})]\big|\\
		=&\,	\bigg|2^{\frac{jd}{2}}\Psi_u(2^jX_{i}-\boldsymbol{k})\bigg|+\left| \int_{\Omega}g(\boldsymbol{x})2^{\frac{jd}{2}}{\Psi_u}(2^j\boldsymbol{x}-\boldsymbol{k})d\boldsymbol{x}\right| \\
		=&\, 	\bigg|2^{\frac{jd}{2}}\Psi_u(2^jX_{i}-\boldsymbol{k})\bigg|+\left| \int_{\Omega}\theta h(\boldsymbol{x})2^{\frac{jd}{2}}{\Psi_u}(2^j\boldsymbol{x}-\boldsymbol{k})d\boldsymbol{x}\right|\\ 
		&+\left| \int_{\Omega}(1-\theta)f(\boldsymbol{x})2^{\frac{jd}{2}}{\Psi_u}(2^j\boldsymbol{x}-\boldsymbol{k})d\boldsymbol{x}\right|
		\lesssim 2^{\frac{jd}{2}}.
	\end{align*}
	By the condition $2^{jd}\lesssim \frac{n}{\ln n}$, 
	\begin{align}\label{Pr2}
		|G_{i}|\lesssim \sqrt{\frac{n}{\ln n}}.
	\end{align} 
	Due to the proof of Lemma \ref{b2},
	\begin{align}\label{Pr3}
		\sigma^2=\frac{1}{n}\sum_{i=1}^{n}\mathbf{var}[G_{i}]=\frac{1}{n}\sum_{i=1}^{n}{\rm{E}}[G^2_{i}]\lesssim 1.
	\end{align} 
	According to ({\color{blue}\ref{Pr1}})--({\color{blue}\ref{Pr3}}) and Bernstein's inequality, we have
	\begin{align*}
		{\rm Pr}\left(\frac{1}{n}\bigg|\sum_{i=1}^{n}G_{i}\bigg|\geq\kappa t_{n}\right)&\lesssim \exp\left\{-\frac{n\kappa^{2}t_{n}^{2}}{2\big(\sigma^{2}+\big(\kappa t_{n}\sqrt{\frac{n}{\ln n}}\big)/3\big)}\right\}\\
		&\lesssim \exp\left\{-\frac{(\ln n)\kappa^{2}}{2(\sigma^{2}+\kappa/3)}\right\}\\
		&\lesssim n^{-\frac{\kappa^{2}}{2(\sigma^{2}+\kappa/3)}}.
	\end{align*}
	Then, a sufficiently large $\kappa$ can be selected such that  
	\begin{align*}
		{\rm Pr}(|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|\geq \kappa t_{n})\lesssim n^{-\frac{\kappa^{2}}{2(\sigma^{2}+\kappa/3)}}\lesssim n^{-2p}.
	\end{align*}
\end{proof}	
Under the above conditions, the convergence rate of the nonlinear wavelet estimator is characterized by the following theorem.	
\begin{theorem}\label{Theoren2}
	For the model ({\color{blue}{\rm{\ref{1.1}}}}), $f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega)$ with $\eta,q\in[1,\infty)$, $s>0$ and $1\leq p<\infty$. If $\{1\leq p<\eta\}$ or $\{1\leq \eta\leq p,s>\frac{d}{\eta}\}$,  the nonlinear estimator $ \hat{f}_{n}^{\rm non}(\boldsymbol{x})$  with $2^{j_{2}}\sim (n/\ln n)^{\frac{1}{d}}$ and $2^{j_{*}}\sim n^{\frac{1}{2m+d}}(m>s)$ has		
	\begin{align*}
		{\rm{E}}\big[\big\|\hat{f}_{n}^{\rm non}(\boldsymbol{x})-f(\boldsymbol{x})
		\big\|_{p}^{p}\big]\lesssim (\ln n)^{\frac{3p}{2}}n^{-\delta p},
	\end{align*}
	where
	\begin{align*}
		\delta= \min\left\{\frac{s}{2s+d}, \frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}\right\} =
		\begin{cases}
			\frac{s}{2s+d},
			& \eta>\frac{pd}{2s+d}, \\
			\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d},
			& \eta\leq\frac{pd}{2s+d}.
		\end{cases}
	\end{align*}		
\end{theorem}
\begin{remark}
	Taking into account the lower bound estimations in Theorem \ref{MT}, note that the convergence rate of this nonlinear wavelet estimator matches the optimal convergence rate up to an $(\ln n)^{\frac{3p}{2}}$ factor. In addition, the results of this theorem are consistent with the convergence rate of Liang and Kou \cite{Liang}, in the case of $p=2$.
\end{remark}			
\begin{remark}
	While both linear and nonlinear wavelet estimators achieve optimal convergence rates when $\eta > p$, the nonlinear estimator demonstrates superior convergence performance for $\eta\leq p$. Notably, the nonlinear wavelet estimator exhibits adaptivity as its construction does not require prior knowledge of the smoothness parameters of the unknown density function.
\end{remark}	

\begin{remark}
	Note that the mixture density model ({\color{blue}\ref{1.1}}) reduces to the nonparametric estimation problem of Kou and Chen \cite{Kou}, when the density function $h(\boldsymbol{x})\equiv1$, which means that the function $h(\boldsymbol{x})$ denotes the density function of uniform distribution. Then, those upper bound estimations of two wavelet estimators in Theorems \ref{Theoren1} and \ref{Theoren2} are the same as the results of Kou and Chen \cite{Kou}. It should be pointed out that Kou and Chen \cite{Kou} only focus on the upper bound estimations of the wavelet method; they do not study the lower bound estimations. In this paper, in order to discuss the optimality of wavelet estimators, the lower bound estimations of wavelet estimators are investigated in Section \ref{S3}.
\end{remark}
%{\textbf{\hspace{-0.5cm}Proof}}	
\begin{proof}
By the definitions of $\hat{f}_{n}^{\rm non}(\boldsymbol{x})$, $\hat{f}_{n}^{\rm lin}(\boldsymbol{x})$, ${\rm P}_{V_{j}}f(\boldsymbol{x})$, and ({\color{blue}{\rm{\ref{2.1}}}}),
\begin{align*}
	\hat{f}_{n}^{\rm non}(\boldsymbol{x})-f(\boldsymbol{x})
	=&\,
	\sum_{\textit{\textbf{k}}\in\Lambda_{j_{*}}}(\hat\alpha_{j_{*},\textit{\textbf{k}}}-\alpha_{j_{*},\textit{\textbf{k}}})\Phi_{j_{*},\textit{\textbf{k}}}(\boldsymbol{x})-\sum\limits_{j = {j_2+1}}^{\infty} {\sum\limits_{u = 1}^{{2^d} - 1} {\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\beta _{j,\textit{\textbf{k}},u}}}} {\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\\
	&+\sum\limits_{j = {j_*}}^{{{j}_2}} {\sum\limits_{u = 1}^{{2^d} - 1} {\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\hat\beta _{j,\textit{\textbf{k}},u}}}} {\mathbb{I}_{\{ |{{\hat \beta }_{j,\textit{\textbf{k}},u}}| \ge \kappa {t_n}\} }}{\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})-\sum\limits_{j = {j_*}}^{j_2} {\sum\limits_{u = 1}^{{2^d} - 1} {\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\beta _{j,\textit{\textbf{k}},u}}}} {\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x})\\
	=&\,\big(\hat{f}_{n}^{\rm lin}(\boldsymbol{x})-{\rm P}_{V_{j_*}} f(\boldsymbol{x})\big)-\big(f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}  f(\boldsymbol{x})\big)\\
	&+\sum\limits_{j = {j_*}}^{{j_2}} {\sum\limits_{u = 1}^{{2^d} - 1} {\sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}\big( {\hat\beta _{j,\textit{\textbf{k}},u}}}} {\mathbb{I}_{\{ |{{\hat \beta }_{j,\textit{\textbf{k}},u}}| \ge \kappa {t_n}\} }}-\beta _{j,\textit{\textbf{k}},u}\big){\Psi}_{j,\textit{\textbf{k}},u}(\boldsymbol{x}).
\end{align*}
Hence, 
\begin{align}\label{47}
	{\rm{E}}\big[\big\| \hat{f}_{n}^{\rm non}(\boldsymbol{x})-f(\boldsymbol{x})
	\big\|_{p}^{p}\big]\lesssim A+B+C.
\end{align}
In the above inequality,
\begin{align*}
	A:=&{\rm{E}}\left[\left\| \hat f_n^{\rm lin}(\boldsymbol{x})- {\rm P}_{V_{j_*}} f(\boldsymbol{x})\right\| ^{p}_{p} \right],\\
	B:=&\big\|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})  \big\| ^{p}_{p},\\
	C:=&{\rm{E}}\left[ \left\|  \sum\limits_{j = {j_*}}^{{j_2}} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\left({{{\hat\beta }_{j,\textit{\textbf{k}},u}}{\mathbb{I}_{\{{|{{{\hat \beta }_{j,\textit{\textbf{k}},u}}}| \ge \kappa {t_n}}\}}} - {\beta _{j,\textit{\textbf{k}},u}}}\right)}{\Psi _{j,\textit{\textbf{k}},u}}(\boldsymbol{x})\right\| ^{p}_{p}\right].
\end{align*}
\par For $A$, it follows from Lemma \ref{y2.2} that
\begin{align*}
	{\rm{E}}\left[\left\| \hat f_n^{\rm lin}(\boldsymbol{x})- {\rm P}_{V_{j_*}} f(\boldsymbol{x})\right\| ^{p}_{p} \right]
	&=	{\rm{E}}\left[\left\|\sum_{\textit{\textbf{k}}\in \Lambda_{j_{*}}}\left(\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}\right)\Phi_{j_*,\textit{\textbf{k}}}(\boldsymbol{x})\right\|^{p}_{p}\right]\\
	&\lesssim 2^{j_{*}(\frac{d}{2}-\frac{d}{p})p}\sum_{\textit{\textbf{k}}\in \Lambda_{j_*}}{\rm{E}}\big[\left|\hat{\alpha}_{j_*,\textit{\textbf{k}}}-\alpha_{j_*,\textit{\textbf{k}}}\right|^p\big].
\end{align*}
By  Lemma \ref{a2}, we have  $\left|\Lambda_{j_*}\right|\sim2^{j_{*}d}$, with $2^{j_{*}}\sim n^{\frac{1}{2m+d}}(m>s)$,
\begin{align}\label{48}
	A\lesssim  2^{j_{*}(\frac{d}{2}-\frac{d}{p})p}\sum_{\textit{\textbf{k}}\in \Lambda_{j_*}}n^{-\frac{p}{2}}\lesssim  2^{\frac{j_{*}dp}{2}}\cdot n^{-\frac{p}{2}}\sim n^{-\frac{mp}{2m+d}}\leq n^{-\frac{sp}{2s+d}}\leq n^{-\delta p}.
\end{align}
\par For $B$, when $1\leq p<\eta$, $s^{\prime}=s-d(\frac{1}{\eta}-\frac{1}{p})_{+}=s$. By the Hölder inequality and $f(\boldsymbol{x})\in B_{\eta,q}^{s}(\Omega)$, 	
\begin{align*}
	\|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})\|_{p}^{p}&=\int_{\Omega}|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})|^{p}\cdot1d\boldsymbol{x}\\
	&\leq \left(\int_{\Omega}|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})|^{p\cdot\frac{{\eta}}{p}}d\boldsymbol{x}\right)^\frac{{p}}{\eta}\left(\int_{\Omega} 1^{\frac{\eta}{\eta-p}}d\boldsymbol{x}\right)^{1 - \frac{p}{\eta}}\\
	&\lesssim\left(\int_{\Omega}|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})|^{\eta}d\boldsymbol{x}\right)^\frac{p}{\eta}\\
	&=\|f(\boldsymbol{x})-{\rm P}_{V_{j_{2}+1}}f(\boldsymbol{x})\|_{\eta}^{p}.
\end{align*}
Then, the same method is used as in the proof of ({\color{blue}{\ref{PP}}}), and we can obtain
\begin{align}\label{B1}
	B\lesssim \sum_{j=j_{2}+1}^{\infty}2^{-jsp}\lesssim 2^{-j_{2}sp}\sim \left(\frac{\ln n}{n}\right)^{\frac{sp}{d}}\leq \left(\frac{\ln n}{n}\right)^{\frac{sp}{2s+d}}\leq \left(\frac{\ln n}{n}\right)^{\delta p}.
\end{align}
Furthermore, when $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$, $s^{\prime}=s-\frac{d}{\eta}+\frac{d}{p}$.
The Besov spaces embedding theorem gives $B_{\eta,q}^{s}(\Omega)\subseteq B_{p,q}^{{s-\frac{d}{\eta}+\frac{d}{p}}}(\Omega)$ and
\begin{align}\label{B2}
	B\lesssim \sum_{j=j_{2}+1}^{\infty}2^{-js^{\prime}p}\lesssim 2^{-j_{2}(s-\frac{d}{\eta}+\frac{d}{p})p}\sim \left(\frac{\ln n}{n}\right)^{\frac{(s-\frac{d}{\eta}+\frac{d}{p})p}{d}}\leq \left(\frac{\ln n}{n}\right)^{\frac{(s-\frac{d}{\eta}+\frac{d}{p})p}{2(s-\frac{d}{\eta})+d}}\leq \left(\frac{\ln n}{n}\right)^{\delta p}.
\end{align}
In conclusion, by ({\color{blue}\ref{B1}}) and ({\color{blue}\ref{B2}}), when $1\leq p<\infty$,
\begin{align}\label{411}
	B\lesssim \left(\frac{\ln n}{n}\right)^{\delta p}.
\end{align}
\par By Lemma \ref{y2.2} and the Hölder inequality, $C$ can be written as  
\begin{align*}
	C&:={\rm{E}}\left[ \left\|  \sum\limits_{j = {j_*}}^{{j_2}} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} {\left({{{\hat\beta }_{j,\textit{\textbf{k}},u}}{\mathbb{I}_{\{{|{{{\hat \beta }_{j,\textit{\textbf{k}},u}}}| \ge \kappa {t_n}}\}}} - {\beta _{j,\textit{\textbf{k}},u}}}\right)}{\Psi _{j,\textit{\textbf{k}},u}}(\boldsymbol{x})\right\| ^{p}_{p}\right]\\
	&\lesssim (j_{2}-j_{*}+1)^{p-1}\sum\limits_{j = {j_*}}^{{j_2}}{\rm{E}}\left[\left \|  \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\left({{{\hat\beta }_{j,\textit{\textbf{k}},u}}{\mathbb{I}_{\{{|{{{\hat \beta }_{j,\textit{\textbf{k}},u}}}| \ge \kappa {t_n}}\}}} - {\beta _{j,\textit{\textbf{k}},u}}}\right)}{\Psi _{j,\textit{\textbf{k}},u}} (\boldsymbol{x})\right\|^{p}_{p}\right]\\
	&\lesssim (j_{2}-j_{*}+1)^{p-1}\sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}}\left[\left|{{{{\hat\beta }_{j,\textit{\textbf{k}},u}}{\mathbb{I}_{\{{|{{{\hat \beta }_{j,\textit{\textbf{k}},u}}}| \ge \kappa {t_n}}\}}} - {\beta _{j,\textit{\textbf{k}},u}}}}\right|^{p}\right].
\end{align*}
Note that 
\begin{align*}
	\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n}\}}-\beta_{j,\textit{\textbf{k}},u}\right|
	\lesssim&\, \left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n}\}}+ \left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}\}}\\
	=&\,\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|<\frac{\kappa t_{n}}{2}\}}\\
	&+\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}}\\
	&+\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}\}}\\
	&+\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}.
\end{align*}
We will discuss the following four parts:\vskip0.1cm
\noindent\textbf{(1)} $\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},  |{\beta}_{j,\textit{\textbf{k}},u}|<\frac{\kappa t_{n}}{2}\}},$\vskip0.2cm

\noindent\textbf{(2)} $ \left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}},$\vskip0.2cm

\noindent\textbf{(3)} $\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}\}},$\vskip0.2cm

\noindent\textbf{(4)} $\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}.$\vskip0.2cm
For \textbf{(1)}, the conditions $\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|\!\geq\!\kappa t_n$ and $|\beta_{j,\textit{\textbf{k}},u}|\!<\!\frac{\kappa t_n}{2}$ imply that $\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\!-\!\beta_{j,\textit{\textbf{k}},u}\right|\!\geq\!\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|\!-\!|\beta_{j,\textit{\textbf{k}},u}|\!>\!\frac{\kappa t_n}{2}$ and 
\begin{align*}
	\left\{\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|\geq\kappa t_n,|\beta_{j,\textit{\textbf{k}},u}|<\frac{\kappa t_n}{2}\right\}\subseteq\left\{\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|>\frac{\kappa t_n}{2}\right\}.
\end{align*} 
Therefore, we can get
\begin{gather*}
	\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|<\frac{\kappa t_{n}}{2}\}}\leq \left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\left\{\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|>\frac{\kappa t_n}{2}\right\}}.
\end{gather*} 
For \textbf{(2)},  since $\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}}\leq\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}}$,
we can get
\begin{gather*}
	\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}}\leq\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq\frac{\kappa t_{n}}{2}\}}.
\end{gather*} 
For \textbf{(4)}, due to $\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}\leq \mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}$, it is easy to see that
\begin{gather*}
	\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}\leq \left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}.
\end{gather*} 
For \textbf{(3)}, note that $\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|<\kappa t_{n}$ and  $|{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}$; we see that $\left|\hat{\beta}_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\right|\geq\left|\beta_{j,\textit{\textbf{k}},u}\right|-\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|>\kappa t_{n}$,
\begin{align*}
	\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}\}}&\leq \left( \left| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \right|+\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|
	\right) \mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n},|{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}\}}\\
	&< \left( \left| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \right|+\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\right|
	\right) \mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \kappa t_{n}\}}\\
	&< \left( \left| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \right|+\left|2\kappa t_{n}\right|
	\right) \mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \kappa t_{n}\}}\\
	&\lesssim \left| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \right|
	\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}.
\end{align*}
Therefore, we know that
\begin{gather*}
	\left|\beta_{j,\textit{\textbf{k}},u}\right|\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|<\kappa t_{n}, |{\beta}_{j,\textit{\textbf{k}},u}|> 2\kappa t_{n}\}}\lesssim  \left| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \right|
	\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}.
\end{gather*}
In line with these discussions, we obtain
\begin{align*}
	\left|\hat{\beta}_{j,\textit{\textbf{k}},u}\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}|\geq\kappa t_{n}\}}-\beta_{j,\textit{\textbf{k}},u}\right|^{p}\lesssim&\,  \big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}\\
	&+ \big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}+\left|\beta_{j,\textit{\textbf{k}},u}\right|^p\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}.
\end{align*}
Therefore, $C$ can be decomposed into three parts:
\begin{align}\label{c0}
	C\lesssim (j_{2}-j_{*}+1)^{p-1}(C_1+C_2+C_3),
\end{align}
where
\begin{align*}
	C_{1}=&\,\sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}\right],\\
	C_{2}=&\,\sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}\right],\\
	C_{3}=&\,\sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}.
\end{align*}
For $C_{1}$, it follows from the Hölder inequality that 
\begin{align*}
	{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}\right]
	&\leq \left\{{\rm{E}}\Big[\big|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}\big|^{2p}\Big]\right\}^{\frac{1}{2}}\Big\{{\rm{E}}\big[\mathbb{I}_{\{|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}|> \frac{\kappa t_{n}}{2}\}}\big]  \Big\}^{\frac{1}{2}}\\
	&\leq \left\{{\rm{E}}\Big[\big|\hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u}\big|^{2p}\Big]\right\}^{\frac{1}{2}}\Big\{{\rm Pr}\big(\big|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}\big|> \frac{\kappa t_{n}}{2}  \big)\Big\}^{\frac{1}{2}}.
\end{align*}
Based on Lemma \ref{b2}, Lemma \ref{e1}, and $|\hat\beta_{j,\textit{\textbf{k}},u}-\beta_{j,\textit{\textbf{k}},u}|^p\lesssim \left(\frac{n}{\ln n}\right)^{\frac{p}{2}}$, we get
\begin{align*}
	{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^{2p}\right]&\leq {\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\right]\\
	&\lesssim 	\left(\frac{n}{\ln n}\right)^{\frac{p}{2}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\right]\\
	&\lesssim 	\left(\frac{n}{\ln n}	\right)^\frac{p}{2}n^{-\frac{p}{2}}\lesssim (\ln n)^{-\frac{p}{2}}.
\end{align*}
Furthermore, since $\left|\Lambda_{j}\right|\sim 2^{jd}$ and $2^{j_{2}}\sim \left(\frac{n}{\ln n}\right)^{\frac{1}{d}}$,
\begin{equation}
	\begin{split}
	C_{1}&\lesssim \sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}(\ln n)^{-\frac{p}{4}}n^{-p}\lesssim \sum\limits_{j = {j_*}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} 2^{jd}(\ln n)^{-\frac{p}{4}}n^{-p}\\
	&\lesssim 2^{j_2(\frac{d}{2}-\frac{d}{p})p+j_2d}(\ln n)^{-\frac{p}{4}}n^{-p}\lesssim (\ln n)^{-\frac{3p}{4}}n^{-\frac{p}{2}}\\
	&\leq \left(\frac{\ln n}{n}\right)^\frac{p}{2}\leq \left(\frac{\ln n}{n}\right)^{\delta p}.
	\end{split}
	\label{c1}
\end{equation}
For $C_{2}$, one defines $2^{j'}\sim n^{\frac{1}{2s+d}} $, we obtain $2^{j_{*}}\sim n^{\frac{1}{2m+d}}(m>s) \leq2^{j'}\sim n^{\frac{1}{2s+d}}\leq 2^{j_{2}}\sim \left( \frac{n}{\ln n}\right)^{\frac{1}{d}}$, and $C_{2}$ can be defined as
\begin{equation}
	\begin{split}
	C_{2}
	&=\left(\sum\limits_{j = {j_*}}^{{j'}}+\sum\limits_{j = {j'+1}}^{{j_2}}\right)2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}\right]\\
	&:=C_{21}+C_{22}.
	\end{split}
	\label{c21}
\end{equation}
From Lemma \ref{b2} and $2^{j'}\sim n^{\frac{1}{2s+d}}$,
\begin{equation}
	\begin{split}
	C_{21}&:=\sum\limits_{j = {j_*}}^{{j'}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}\right]\\
	&\lesssim \sum\limits_{j = {j_*}}^{{j'}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} n^{-\frac{p}{2}}\lesssim \sum\limits_{j = {j_*}}^{{j'}} 2^{j(\frac{d}{2}-\frac{d}{p})p}\cdot2^{jd}\cdot n^{-\frac{p}{2}}\\
	&\lesssim 2^{\frac{j^{\prime}pd}{2}}\cdot n^{-\frac{p}{2}}\sim n^{-\frac{sp}{2s+d}}\leq n^{-\delta p}.
	\end{split}
	\label{c22}
\end{equation}
For $C_{22}$, when $p\geq 1$, using Lemma \ref{b2}, one obtains 
\begin{align*}
	C_{22}&:=\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}
	\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}\right]\\
	&\lesssim n^{-\frac{p}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}.	
\end{align*}	
When $1\leq p<\eta$, by the Hölder inequality, Lemma \ref{b2}, $t_{n}=\sqrt{\frac{\ln n}{n}}$, and $2^{j'}\sim n^{\frac{1}{2s+d}}$, we get
\begin{equation}
	\begin{split}
	C_{22}&\lesssim  \ n^{-\frac{p}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}\left(\frac{|{\beta}_{j,\textit{\textbf{k}},u}|}{ \kappa t_{n}/2}\right)^p\lesssim(\ln n)^{-\frac{p}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\|\beta_{j,\textit{\textbf{k}},u}\|_{p}^{p}\\
	&\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}  2^{jd(\frac{1}{p}-\frac{1}{\eta})p}  \|\beta_{j,\textit{\textbf{k}},u}\|_{\eta}^{p}\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}  2^{jd(\frac{1}{p}-\frac{1}{\eta})p}   2^{-j(s-\frac{d}{\eta}+\frac{d}{2})p}\\
	&=\sum\limits_{j = {j'+1}}^{{j_2}}2^{-jps}
	\lesssim 2^{-j^{\prime}ps}  \sim  n^{-\frac{sp}{2s+d}}\leq n^{-\delta p}.
	\end{split}
	\label{c221}
\end{equation}
When $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$,
\begin{align*}
	C_{22}&\lesssim n^{-\frac{p}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}\left(\frac{|{\beta}_{j,\textit{\textbf{k}},u}|}{ \kappa t_{n}/2}\right)^{\eta}	\notag\\
	&\lesssim (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\|\beta_{j,\textit{\textbf{k}},u}\|_{\eta}^{\eta}\notag\\
	&\leq (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} \sum\limits_{j = {j'+1}}^{{j_2}}2^{-j( s\eta-\frac{d}{2}(p-\eta))}.
\end{align*}
Let $\varepsilon := s\eta-\frac{d}{2}(p-\eta)$. Then $C_{22}$ can be written as
\begin{align*}
	C_{22}&\lesssim (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} \sum\limits_{j = {j'+1}}^{{j_2}}2^{-j\varepsilon}.
\end{align*}
When $\varepsilon>0$, which holds if and only if $\eta>\frac{dp}{2s+d}
$, we have $\delta=\frac{s}{2s+d}$ and
\begin{align}\label{c222}
	C_{22}&\lesssim (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}}2^{-j^{\prime}\varepsilon}\lesssim  (\ln n)^{-\frac{\eta}{2}} n^{-\frac{sp}{2s+d}}\leq n^{-\frac{sp}{2s+d}}\leq n^{-\delta p}.
\end{align}
When $\varepsilon\leq 0$, which holds if and only if $\eta\leq\frac{dp}{2s+d}
$, we have $\delta=\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}$. We take $2^{j_1}\sim n^{\frac{1}{2(s-\frac{d}{\eta})+d}}$. Obviously, $2^{j'}\sim n^{\frac{1}{2s+d}}<2^{j_1}\sim n^{\frac{1}{2(s-\frac{d}{\eta})+d}}< 2^{j_{2}}\sim \left( \frac{n}{\ln n}\right)^{\frac{1}{d}}$, and $C_{22}$ can be written as
\begin{align*}
	C_{22}&:=\left(\sum\limits_{j = {j'+1}}^{{j_1}}+\sum\limits_{j = {j_1+1}}^{{j_2}}\right)2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}{\rm{E}} \left[\big| \hat{\beta}_{j,\textit{\textbf{k}},u}-{\beta}_{j,\textit{\textbf{k}},u} \big|^p\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\geq \frac{\kappa t_{n}}{2}\}}\right]\notag\\
	&:=C_{221}+C_{222}.
\end{align*}
For $C_{221}$, note that $\frac{\eta-p}{2}-\frac{\delta\varepsilon}{s-\frac{d}{\eta}+\frac{d}{p}}=-\delta p$ and
\begin{equation}
	\begin{split}
	C_{221}&\lesssim (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} \sum\limits_{j = {j'+1}}^{{j_1}}2^{-j\varepsilon}\lesssim  (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} 2^{-j_1\varepsilon}\\
	&\lesssim  (\ln n)^{-\frac{\eta}{2}} n^{\frac{\eta-p}{2}} n^{-\frac{\delta\varepsilon}{s-\frac{d}{\eta}+\frac{d}{p}}}= (\ln n)^{-\frac{\eta}{2}} n^{-\delta p} \leq n^{-\delta p} .
	\end{split}
	\label{c223}
\end{equation}
For $C_{222}$, when $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$,  $B_{\eta,q}^{s}(\Omega)\subseteq B_{p,q}^{{s-\frac{d}{\eta}+\frac{d}{p}}}(\Omega)$. Furthermore, we can get 
\begin{equation}
	\begin{split}
	C_{222}&\lesssim  n^{-\frac{p}{2}}\sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}\left(\frac{|{\beta}_{j,\textit{\textbf{k}},u}|}{ \kappa t_{n}/2}\right)^{p}\lesssim  (\ln n)^{-\frac{p}{2}}\sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\|{\beta}_{j,\textit{\textbf{k}},u}\|_{p}^{p}\\
	&\leq  (\ln n)^{-\frac{p}{2}}\sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}2^{-j(s-\frac{d}{\eta}+\frac{d}{2})p}\lesssim  (\ln n)^{-\frac{p}{2}}2^{-j_1(s-\frac{d}{\eta}+\frac{d}{p})p}\\
	&\sim  (\ln n)^{-\frac{p}{2}}n^{-\frac{(s-\frac{d}{\eta}+\frac{d}{p})p}{2(s-\frac{d}{\eta})+d}}\leq  (\ln n)^{-\frac{p}{2}}n^{-\delta p}\leq n^{-\delta p}.
	\end{split}
	\label{c224}
\end{equation}
By combining ({\color{blue}\ref{c221}}), ({\color{blue}\ref{c222}}), ({\color{blue}\ref{c223}}), and ({\color{blue}\ref{c224}}), we get 	
\begin{align}\label{c23}
	C_{22}\lesssim  n^{-\delta p}.
\end{align} 
This, together with ({\color{blue}\ref{c21}}), ({\color{blue}\ref{c22}}), and ({\color{blue}\ref{c23}}), shows that          
\begin{align}\label{c2}
	C_{2}\lesssim  n^{-\delta p}.
\end{align}  
Finally, $C_{3}$ could be rewritten as
\begin{align*}
	C_{3}&=\left(\sum\limits_{j = {j_*}}^{{j'}}+\sum\limits_{j = {j'+1}}^{{j_2}}\right)2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p
	\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}\notag\\
	&:=C_{31}+C_{32}.	
\end{align*} 
For $C_{31}$, 
\begin{align*}
	C_{31}&=\sum\limits_{j = {j_*}}^{{j'}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}\leq \sum\limits_{j = {j_*}}^{{j'}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|2\kappa t_{n} \right|^p\notag\\
	&\leq \sum\limits_{j = {j_*}}^{{j'}}2^{j(\frac{d}{2}-\frac{d}{p})p} 2^{jd}\left(\frac{\ln n}{n}\right)^{\frac{p}{2}}\lesssim  \left(\frac{\ln n}{n}\right)^{\frac{p}{2}} 2^{\frac{j'pd}{2}} \notag\\
	&\lesssim (\ln n)^{\frac{p}{2}} n^{-\frac{sp}{2s+d}}\leq (\ln n)^{\frac{p}{2}}n^{-\delta p}.
\end{align*} 	
For $C_{32}$, when $1\leq p<\eta$, by the Hölder inequality, Lemma \ref{y}, $t_{n}=\sqrt{\frac{\ln n}{n}}$, and $2^{j'}\sim n^{\frac{1}{2s+d}}$,
\begin{align*}
	C_{32}&\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p=	\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\|\beta_{j,\textit{\textbf{k}},u}\|_{p}^{p}\notag \\&\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}  2^{jd(\frac{1}{p}-\frac{1}{\eta})p}  \|\beta_{j,\textit{\textbf{k}},u}\|_{\eta}^{p}\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}  2^{jd(\frac{1}{p}-\frac{1}{\eta})p}   2^{-j(s-\frac{d}{\eta}+\frac{d}{2})p}\notag\\
	&\lesssim \sum\limits_{j = {j'+1}}^{{j_2}}2^{-jps}\lesssim 2^{-j'ps}\sim n^{-\frac{sp}{2s+d}}\leq n^{-\delta p}.
\end{align*} 
When $1\leq \eta\leq p$, one has
\begin{align*}
	C_{32}&\leq \sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p\left(\frac{2\kappa t_{n}}{\left|{\beta}_{j,\textit{\textbf{k}},u} \right|}\right)^{p-{\eta}}\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} \|\beta_{j,\textit{\textbf{k}},u}\|_{\eta}^{\eta}\notag\\
	&\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p} 2^{-j{\eta}(s-\frac{d}{\eta}+\frac{d}{2})}\leq \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{-j( s\eta-\frac{d}{2}(p-\eta))}\notag	\\
	&=\left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}\sum\limits_{j = {j'+1}}^{{j_2}}2^{-j\varepsilon}	.
\end{align*} 
When $\varepsilon>0$, i.e., $\eta>\frac{dp}{2s+d}$, one has $\delta=\frac{s}{2s+d}$. Then
\begin{align*}
	C_{32}&\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}2^{-j'\varepsilon}\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}2^{-j'( s\eta-\frac{d}{2}(p-\eta))}\notag\\
	&\lesssim (\ln n)^{\frac{p-{\eta}}{2}}n^{\frac{{\eta}-p}{2}}n^{- \frac{s\eta-\frac{d}{2}(p-\eta)}{2s+d}}\leq (\ln n)^{\frac{p}{2}}n^{-\frac{sp}{2s+d}}\leq (\ln n)^{\frac{p}{2}}n^{-\delta p}.
\end{align*} 
When $\varepsilon\leq 0$, i.e., $\eta\leq\frac{dp}{2s+d}$, one has $\delta=\frac{s-\frac{d}{\eta}+\frac{d}{p}}{2(s-\frac{d}{\eta})+d}$. Define $j_1$ by $2^{j_1}\sim n^{\frac{1}{2(s-\frac{d}{\eta})+d}}$; obviously, $2^{j'}\sim n^{\frac{1}{2s+d}}<2^{j_1}\sim n^{\frac{1}{2(s-\frac{d}{\eta})+d}}< 2^{j_{2}}\sim \left( \frac{n}{\ln n}\right)^{\frac{1}{d}}$, and $C_{32}$ can be written as
\begin{align*}
	C_{32}&=\left(\sum\limits_{j = {j'+1}}^{{j_1}}+\sum\limits_{j = {j_1+1}}^{{j_2}}\right)2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}} \left|{\beta}_{j,\textit{\textbf{k}},u} \right|^p\mathbb{I}_{\{|{\beta}_{j,\textit{\textbf{k}},u}|\leq 2\kappa t_{n}\}}\notag\\
	&:=C_{321}+C_{322}.
\end{align*}
For $C_{321}$, 
\begin{align*}
	C_{321}	&\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}\sum\limits_{j = {j'+1}}^{{j_1}}2^{-j\varepsilon}\lesssim \left(\frac{\ln n}{n}\right)^{\frac{p-{\eta}}{2}}2^{-j_1\varepsilon}\sim (\ln n)^{\frac{p}{2}}n^{-\delta p}.
\end{align*}
For $C_{322}$, when $1\leq \eta\leq p$ and $s>\frac{d}{\eta}$,  $B_{\eta,q}^{s}(\Omega)\subseteq B_{p,q}^{{s-\frac{d}{\eta}+\frac{d}{p}}}(\Omega)$, and
\begin{align*}
	C_{322}&\lesssim  \sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\sum\limits_{u = {1}}^{{2^d-1}}  \sum\limits_{\textit{\textbf{k}} \in {\Lambda _j}}|{\beta}_{j,\textit{\textbf{k}},u}|^{p}\lesssim \sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}\|{\beta}_{j,\textit{\textbf{k}},u}\|_{p}^{p}	\notag\\
	&\lesssim  \sum\limits_{j = {j_1+1}}^{{j_2}}2^{j(\frac{d}{2}-\frac{d}{p})p}2^{-j(s-\frac{d}{\eta}+\frac{d}{2})p}\leq \sum\limits_{j = {j_1+1}}^{{j_2}}2^{-jp(s-\frac{d}{\eta}+\frac{d}{p})}\notag\\
	&\lesssim 2^{-j_1p(s-\frac{d}{\eta}+\frac{d}{p})}\leq n^{-\delta p}.
\end{align*}
Therefore, in all cases,
\begin{align}\label{c3}
	C_{3} \lesssim  (\ln n)^{\frac{p}{2}}n^{-\delta p}.
\end{align}
Combining the above results ({\color{blue}\ref{c0}}), ({\color{blue}\ref{c1}}), ({\color{blue}\ref{c2}}), and ({\color{blue}\ref{c3}}), we have
\begin{align*}
	C\lesssim (j_{2}-j_{*}+1)^{p-1}(C_1+C_2+C_3)\lesssim  (\ln n)^{\frac{3p}{2}}n^{-\delta p}.
\end{align*}
This, together with ({\color{blue}\ref{47}}), ({\color{blue}\ref{48}}), and ({\color{blue}\ref{411}}), shows that
\begin{align*}
	{\rm{E}}\big[\big\| \hat{f}_{n}^{\rm non}(\boldsymbol{x})-f(\boldsymbol{x})
	\big\|_{p}^{p}\big]\lesssim (\ln n)^{\frac{3p}{2}}n^{-\delta p},
\end{align*}
which is the desired conclusion.	
\end{proof}
	\section{Numerical simulations}\label{S5}
\quad To assess the empirical performance of the proposed wavelet-based estimators, we conduct a series of numerical simulations. Specifically, we consider the estimation problem defined in ({\color{blue}\ref{1.1}}), where the goal is to reconstruct the unknown probability density function $f(x)$ from a set of observed data points $\{X_i\}_{i=1}^n$. In order to evaluate the performance of the estimators, we employ the mean squared error $\text{MSE}(f(X_i),\hat{f}(X_i))= \frac{1}{n}\sum_{i=1}^{n}(f(X_i)-\hat{f}(X_i))^2$ as the assessment criterion. For the linear estimator $\hat{f}_n^{\rm lin}(\boldsymbol{x})$, the wavelet scale parameter $j_*$ is selected from the discrete set $\{0, 1, \dots, \log_2(n) - 1\}$. The optimal choice of $j_*$ is obtained by minimizing the mean squared error ($\text{MSE}(f(X_i),\hat{f}^{\rm lin}_{n}(X_i))$). For the nonlinear wavelet estimator, the scale parameter $j_2$ is fixed at the maximum level of wavelet decomposition $j_2 = \log_2(n) - 1$. Similar to the choice of $j_*$, the optimal threshold parameter $\lambda = \kappa t_n$ is selected by minimizing the MSE ($\text{MSE}(f(X_i),\hat{f}^{\rm non}_{n}(X_i))$). The simulation experiments are carried out using RStudio software. In addition, the Daubechies compactly
supported wavelet with 8 vanishing moments is used in the following studies.

In the simulation study of model ({\color{blue}\ref{1.1}}), we choose $n=4096$ and the proportionality coefficient $\theta = 0.1$. Six functions are selected as $f(x)$. As shown in Figures \ref{fig:1}--\ref{fig:6}, both linear and nonlinear wavelet estimators have excellent performance. 
Table \ref{t:1} presents the MSE results for both linear and nonlinear wavelet estimators across various experimental settings. The simulation study indicates that these estimators are capable of effectively approximating the unknown density function $f(x)$. Notably, the nonlinear wavelet estimator exhibits superior performance compared to its linear counterpart.

\begin{example}
%\textbf{Example 1.} 
In  model ({\color{blue}\ref{1.1}}), we consider two density functions: $f_1(x)=2.38x^2e^{-x^2}$ and \linebreak $h(x)=0.15x^2+0.6\cos{2x}+0.15$ defined on the interval  $x\in[0, 2]$. As depicted in Figure \ref{fig:1}\subref{fig:1a}, the optimal scale parameter is determined to be $j_* = 5$, and then the corresponding $MSE(\hat{f}_n^{\mathrm{lin}},f )= 0.01949769$.
Based on Figure \ref{fig:1}\subref{fig:1c}, we find the optimal threshold parameter to be $\lambda = 0.0478260870$. The performance of both linear and nonlinear wavelet estimators is visualized in Figures \ref{fig:1}\subref{fig:1b} and \ref{fig:1}\subref{fig:1d}, respectively. These results clearly demonstrate the effectiveness of the proposed wavelet-based estimators in approximating the target density function.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M1.png}
		\caption{}
		\label{fig:1a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f1.png}
		\caption{}
		\label{fig:1b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam1.png}
		\caption{}
		\label{fig:1c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n1.png}
		\caption{}
		\label{fig:1d}
	\end{subfigure}
		\caption{The estimates of $f_{1}(x)$:  (a) $MSE(\hat{f}_n^{\mathrm{lin}},f )$ with a different scale parameter $j_{*}$; (b) the linear wavelet estimate $\hat{f}_n^{\mathrm{lin}}(\textit{\textbf{x}})$;   (c) $MSE(\hat{f}_n^{\mathrm{non}},f )$ with a different thresholding parameter $\lambda$; (d) the nonlinear wavelet estimate $\hat{f}_n^{\mathrm{non}}(\textit{\textbf{x}})$.}
	\label{fig:1}  % ← 总图的标签
\end{figure}
\end{example}

%\newpage
\begin{example}
%\textbf{Example 2.} 
We choose the density functions $f_2(x)=(x-{\mathbb{I}_{\{x\leq 0.3\}}})^2+0.9$, $h(x)=0.5x^2+0.5\cos{2x}+0.53$, $x\in[0, 1]$. The optimal scale parameter $j_* = 10$ can be obtained from Figure \ref{fig:2}\subref{fig:2a}. Figure~\ref{fig:2}\subref{fig:2c} identifies the optimal threshold parameter as $\lambda = 0.11391304$. The estimation results for the linear wavelet estimator are depicted in Figure \ref{fig:2}\subref{fig:2b}. The corresponding results for the nonlinear wavelet estimator are shown in Figure \ref{fig:2}\subref{fig:2d}. Comparative analysis reveals that, when compared to the linear estimator, the nonlinear wavelet estimator demonstrates superior performance, particularly at discontinuity points.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M2.png}
		\caption{}
		\label{fig:2a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f2.png}
		\caption{}
		\label{fig:2b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
	%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam2.png}
		\caption{}
		\label{fig:2c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n2.png}
		\caption{}
		\label{fig:2d}
	\end{subfigure}
\caption{The estimates of  $f_{2}(x)$.}
	\label{fig:2}  % ← 总图的标签
\end{figure}
\end{example}
\newpage
\begin{example}
%\textbf{Example 3.} 
In the context of model ({\color{blue}\ref{1.1}}), we consider two density functions
$f_3(x)=1.6e^{-200(x-0.75)^2}+2e^{-50(x-0.33)^2}$ and $h(x)=0.9x^2+0.35\cos{2x}+0.55$, with $x\in[0, 1]$. The optimal scale parameter $j_* = 6$ can be obtained from Figure \ref{fig:3}\subref{fig:3a}. Furthermore, the optimal threshold parameter is determined to be $\lambda = 0.0030100334$, as shown in Figure \ref{fig:3}\subref{fig:3c}. Figures \ref{fig:3}\subref{fig:3b} and \ref{fig:3}\subref{fig:3d} show that the wavelet estimators employed are capable of effectively approximating the unknown density function.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M4.png}
		\caption{}
		\label{fig:3a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f4.png}
		\caption{}
		\label{fig:3b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
	%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam4.png}
		\caption{}
		\label{fig:3c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n4.png}
		\caption{}
		\label{fig:3d}
	\end{subfigure}
	\caption{The estimates of  $f_{3}(x)$.}
	\label{fig:3}  % ← 总图的标签
\end{figure}
\end{example}

\newpage
\begin{example}
%\textbf{Example 4.} 
We choose the density functions $f_4(x)=e^{-x^2}\sin^2{x}$ and $h(x)=0.1x^2+0.6\cos{2x}+0.11$, $x\in[-2.5, 2.5]$. The optimal scale parameter $j_* = 5$ can be obtained from Figure \ref{fig:4}\subref{fig:4a}. The optimal threshold parameter, denoted as $\lambda$, is determined to be 0.00301003344, as depicted in Figure \ref{fig:4}\subref{fig:4c}. The corresponding estimation results are displayed in Figure \ref{fig:4}\subref{fig:4b} for the linear wavelet estimator, and in Figure \ref{fig:4}\subref{fig:4d} for the nonlinear wavelet estimator.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M7.png}
		\caption{}
		\label{fig:4a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f7.png}
		\caption{}
		\label{fig:4b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
	%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam7.png}
		\caption{}
		\label{fig:4c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n7.png}
		\caption{}
		\label{fig:4d}
	\end{subfigure}
	\caption{The estimates of  $f_{4}(x)$.}
	\label{fig:4}  % ← 总图的标签
\end{figure}
\end{example}
\newpage 
\begin{example}
%\textbf{Example 5.} 
In the context of model ({\color{blue}\ref{1.1}}), we choose the density functions $f_5(x)=(4\sin{4\pi x}-\operatorname{sign}(x+0.12)-\operatorname{sign}(0.22-x))/5+1.2$ and $h(x)=0.6x^2+0.5\cos{2x}+0.53$, $x\in[-0.5, 0.5]$. Figures \ref{fig:5}\subref{fig:5a} and \ref{fig:5}\subref{fig:5c} demonstrate that the optimal parameters are determined to be $j_* = 8$ for the scale parameter, and $\lambda = 0.0237458194$ for the threshold parameter. Under these optimal parameters, both wavelet estimators perform well.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M11.png}
		\caption{}
		\label{fig:5a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f11.png}
		\caption{}
		\label{fig:5b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
	%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam11.png}
		\caption{}
		\label{fig:5c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n11.png}
		\caption{}
		\label{fig:5d}
	\end{subfigure}
	\caption{The estimates of  $f_{5}(x)$.}
	\label{fig:5}  % ← 总图的标签
\end{figure}
\end{example}
\newpage
\begin{example}
%\textbf{Example 6.} 
We choose the density functions $f_6(x)=8(2-x)e^{-4(2-x)^2}$, $h(x)=0.2x^2+0.35\cos{2x}+0.51$, and $x\in[0.5, 2]$. The optimal scale parameter $j_* = 6$ can be obtained from Figure \ref{fig:6}\subref{fig:6a}. The optimal threshold parameter is $\lambda = 0.0284280936$, as depicted in Figure \ref{fig:6}\subref{fig:6c}. Figures \ref{fig:6}\subref{fig:6b} and \ref{fig:6}\subref{fig:6d} show that both wavelet estimators can effectively approximate the target density function.
\begin{figure}[h]
	\centering
	% 第一行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{M6.png}
		\caption{}
		\label{fig:6a}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{f6.png}
		\caption{}
		\label{fig:6b}  % ← 这是子图 (b) 的标签
	\end{subfigure}
	
	%	\vspace{0.5cm}
	
	% 第二行
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{lam6.png}
		\caption{}
		\label{fig:6c}
	\end{subfigure}
	\hfill
	\begin{subfigure}[t]{0.48\textwidth}
		\centering
		\includegraphics[width=0.9\linewidth]{n6.png}
		\caption{}
		\label{fig:6d}
	\end{subfigure}
	\caption{The estimates of  $f_{6}(x)$.}
	\label{fig:6}  % ← 总图的标签
\end{figure}
\end{example}

\newpage
Table \ref{t:1} lists the optimal scale parameters, the optimal threshold parameters, and the MSE of the two wavelet estimators in the different examples. For further simulation studies, we vary the sample size $n$ to observe the performance of the two wavelet estimators. According to Table \ref{t:2}, the MSE of both wavelet estimators tends to become smaller as the sample size $n$ grows. In addition, the nonlinear wavelet estimators $\tilde{f}_n$ perform better than the linear wavelet estimators $\hat{f}_n$.
\begin{table}[h]
	\setlength{\tabcolsep}{8.5mm}
	\centering
	\begin{tabular}{cccc}
		\toprule
		\multirow{2}{*}{}
		&     $f_1$     &    $f_2$     &     $f_3$    \\
		\midrule
		$j_*$                &  5            & 10           & 6            \\
		$ \lambda $          &  0.0478260870 & 0.11391304 &  0.0030100334 \\ 
		$ MSE(\hat{f}_n^{\mathrm{lin}},f) $   &   0.01949769  & 0.00514771 &   0.03986530 \\
		$ MSE(\hat{f}_n^{\mathrm{non}},f) $ &  0.01466935  & 0.00298714  &0.02302871  \\
		\midrule
		\multirow{2}{*}{}
		&    $f_4$     &    $f_5$     &     $f_6$     \\
		\midrule
		$j_*$                & 5           & 8            &6             \\
		$ \lambda $          & 0.0237458194    & 0.170568562 &  0.0284280936 \\ 
		$ MSE(\hat{f}_n^{\mathrm{lin}},f) $   &  0.02185451  & 0.02971702  &  0.03902024  \\
		$ MSE(\hat{f}_n^{\mathrm{non}},f) $ & 0.01837104    &   0.02093666    &  0.02811353  \\
		\bottomrule
	\end{tabular}
	\caption{The MSE of the wavelet estimator.}
	\label{t:1}
\end{table}

\begin{table}[h]
	\centering
	\footnotesize
	\setlength{\tabcolsep}{3.5pt}  % reduce the column spacing
	\renewcommand{\arraystretch}{1.25}  % adjust the row height
	\begin{tabular}{ccccccccccccc}
		\toprule
		\multirow{2}{*}{}
		& $n$		                     &128      &256    & 512   &1024   &2048   &4096  &8192  &16384    \\
		\midrule
		\multirow{2}{*}{1}			& $ MSE(\hat{f}_n,f) $   &0.02004337  & 0.01969370  & 0.01967366   &0.01965352   &0.01963825   &0.01949769    &0.01938861    &0.01934151   \\
		& $ MSE(\tilde{f}_n,f) $ &  0.01531049  & 0.017234  & 0.014139 &0.012880 &0.01479997&0.01466935&0.01466550&0.01464125 \\
		\multirow{2}{*}{2}			& $ MSE(\hat{f}_n,f) $   &0.00560728  &0.00517845 & 0.00516828&0.00516463  &0.00515295 & 0.00514771 & 0.00501177 &0.00500330   \\
		& $ MSE(\tilde{f}_n,f) $ &0.00329194  &0.00326762 & 0.00310774&0.00309442  &0.00301939 & 0.00298714 &0.00297252 &0.00295634 \\
		\multirow{2}{*}{3}			& $ MSE(\hat{f}_n,f) $   & 0.04774499 & 0.04085642& 0.03986695&0.03974946  &0.03953559 & 0.03949926 &0.03949290  &0.03936086  \\
		&$ MSE(\tilde{f}_n,f) $ &0.02323339  &0.02308795 &0.02308036 &0.02307001  & 0.02306140&0.02302871  &0.022824  &0.02281976 \\
		\multirow{2}{*}{4}			&$ MSE(\hat{f}_n,f) $   &0.02285294  &0.02189756 & 0.02189058 &0.02188800  &0.02188289  &0.02185451  &0.02162162  &0.02151109 \\
		& $ MSE(\tilde{f}_n,f) $ &0.01921598  &0.01861709 &0.01860857 &0.01850824  & 0.01839866& 0.01837104 & 0.01827369 &0.01809206 \\
		\multirow{2}{*}{5}			&$ MSE(\hat{f}_n,f) $   &0.036478  &0.03209222 &0.03113655 &0.03061721  &0.02962667 &0.02971702  & 0.02965899 & 0.02965899\\
		& $ MSE(\tilde{f}_n,f) $ &0.02417582  &0.02267460 &0.02257770 &0.02212211  & 0.02151516& 0.02093666 &0.02088335  & 0.02086120\\
		\multirow{2}{*}{6}			& $ MSE(\hat{f}_n,f) $   &0.03968358  &0.03950713 &0.03935347 &0.03927934  & 0.03927852&0.03902024  &0.038165  &0.03792818 \\
		&$ MSE(\tilde{f}_n,f) $ &0.02851510  &0.02843545 &0.028391 & 0.02832763 &0.02821251 &0.02811353  &0.02774405  &0.02690849\\
		\bottomrule
	\end{tabular}
		\caption{Estimation results of two wavelet estimators with a different sample size $n$.}
		\label{t:2}
\end{table}


%%===============================================================%%
%% ACKNOWLEDGEMENTS                                              %%
%% Acknowledgements can be added here.                           %%
%%===============================================================%%
\section*{Acknowledgements}
%%===============================================================%%
The authors would like to thank the reviewers for their helpful and constructive comments that contributed to improving the original version of this paper. This work was supported by the National Natural Science Foundation of China (No. 12361016), the Guangxi Natural Science Foundation (No. 2024GXNSFBA010379), the Center for Applied Mathematics of Guangxi (GUET), and the Guangxi Colleges and Universities Key Laboratory of Data Analysis and Computation.



%%===============================================================%%
%% REFERENCES                                                    %%
%% References should be provided in bibtex file.                 %%
%% We suggest using MR Lookup for finding bibtex entries.        %%
%%===============================================================%%
\bibliography{references}



%%===============================================================%%
%% APPENDICES	                                                 %%
%% Appendices can be added here.                                 %%
%%===============================================================%%
%\normalsize
%\begin{appendices}
%\section{Some Appendix}
%Appendices should be placed at the end of the manuscript, after the references list. 
%\end{appendices}
%%===============================================================%%


\end{document} 

