# Source code for qrl.env.core.probability

'''
Implementation of ProbabilityV0 environment

Author: Jay Shah (@Jayshah25)

Contact: jay.shah@qrlqai.com

License: Apache-2.0
'''
from gymnasium import spaces
import pennylane as qml
from pennylane import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import shutil
from ._base import QuantumEnv


# [docs]
class ProbabilityV0(QuantumEnv):
    """
    Probability distribution matching environment for variational quantum circuits.

    ``ProbabilityV0`` is a Gymnasium-style environment in which a parameterized
    quantum circuit is tuned so that its measurement probabilities over the
    computational basis states approach a target distribution. Actions are
    small additive updates to the circuit parameters.

    This environment is suitable for distribution learning, quantum generative
    modeling, and variational circuit optimization tasks.

    Key properties
    --------------
    - **Action space**: ``Box(-0.1, 0.1, shape=(n_params,), dtype=float32)``;
      each action is added to the current circuit parameters.
    - **Observation space**: ``Box(0, 1, shape=(2**n_qubits,), dtype=float32)``,
      the shape of the basis-state probabilities returned by ``step()``.
    - **Reward signal**: ``alpha * KL(target || probs) + (1 - alpha) * L2``.
      As implemented, ``get_reward()`` returns this weighted cost with a
      positive sign, so lower values mean a closer match.
    - **Termination**: ``step()`` reports ``done`` when that value drops below
      ``tolerance`` or when ``max_steps`` steps have been taken.

    Visualization
    -------------
    The ``render()`` method animates the recorded probability distributions
    next to the target distribution; the value recorded for each step is
    shown in the plot title.

    Parameters
    ----------
    n_qubits : int
        Number of qubits in the circuit.
    target_distribution : array-like
        Target probabilities, one per computational basis state
        (``2**n_qubits`` entries summing to 1).
    ansatz : callable or None
        Custom circuit ansatz, called as ``ansatz(params, wires=...)``. It
        must expose an ``n_params`` attribute. If ``None``, a default ansatz
        applying one ``RY`` rotation per qubit is used.
    **kwargs
        Optional settings:

        max_steps : int, default 100
            Maximum number of steps per episode.
        tolerance : float, default -1e3
            Threshold compared against the value from ``get_reward()``.
        alpha : float, default 0.5
            Weight of the KL term; the L2 term is weighted ``1 - alpha``.
        beta : float, default 0.01
            Step penalty weight. It is stored on the instance; no code
            visible in this class applies it.
        ffmpeg : bool, default False
            Save animations as MP4 via FFmpeg instead of GIF via Pillow.

    Raises
    ------
    ValueError
        If the target distribution has the wrong shape or does not sum to 1,
        if ``ffmpeg`` is requested but no ``ffmpeg`` executable is found, or
        if a custom ansatz has no ``n_params`` attribute.

    See Also
    --------
    :doc:`tutorials/probability`
        Tutorial on probability distribution learning with variational circuits.

    """
    def __init__(self,
                 n_qubits: int,
                 target_distribution: np.ndarray,
                 ansatz=None, **kwargs):
        super().__init__()

        # Coerce array-likes (e.g. plain lists) up front so the element-wise
        # arithmetic applied to the target later does not fail on them.
        target_distribution = np.array(target_distribution, dtype=float)
        n_states = 2 ** n_qubits

        # Explicit raises rather than ``assert``: assertions are stripped
        # when Python runs with ``-O``, which would silently skip validation.
        if target_distribution.shape != (n_states,):
            raise ValueError(
                f"Target distribution must have shape ({n_states},), "
                f"got {target_distribution.shape}."
            )
        if not np.isclose(np.sum(target_distribution), 1.0):
            raise ValueError("Target distribution must sum to 1.")

        self.n_qubits = n_qubits
        self.target_distribution = target_distribution
        self.max_steps = kwargs.get("max_steps", 100)
        self.tolerance = kwargs.get("tolerance", -1e3)
        self.alpha = kwargs.get("alpha", 0.5)  # weight for KL vs L2
        self.beta = kwargs.get("beta", 0.01)   # step penalty weight

        # Use the same truthiness test for the availability check and for the
        # writer/extension choice so the two can never disagree.
        use_ffmpeg = kwargs.get("ffmpeg", False)
        if use_ffmpeg and shutil.which("ffmpeg") is None:
            raise ValueError("ffmpeg not found on system. Please install ffmpeg or set ffmpeg=False")
        self.render_extension = "mp4" if use_ffmpeg else "gif"
        self.writer = "ffmpeg" if use_ffmpeg else "pillow"

        # Define PennyLane device
        self.dev = qml.device("default.qubit", wires=self.n_qubits)

        if ansatz is None:
            # Default ansatz: one RY rotation per wire, one parameter each.
            def default_ansatz(params, wires):
                for i, w in enumerate(wires):
                    qml.RY(params[i], wires=w)
            self.ansatz = default_ansatz
            self.n_params = self.n_qubits
        else:
            # Only a missing attribute means "n_params not specified"; any
            # other exception should propagate unchanged.
            try:
                self.n_params = ansatz.n_params
            except AttributeError as err:
                raise ValueError("Please specify ansatz with n_params attribute.") from err
            self.ansatz = ansatz

        # QNode returning the probabilities of all computational basis states.
        @qml.qnode(self.dev)
        def circuit(params):
            self.ansatz(params, wires=range(self.n_qubits))
            return qml.probs(wires=range(self.n_qubits))
        self.circuit = circuit

        # Spaces
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(self.n_params,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=1, shape=(n_states,), dtype=np.float32)

        # Internal state: random starting angles in [0, 2*pi) and empty logs.
        self.params = np.random.uniform(0, 2*np.pi, size=self.n_params)
        self.current_step = 0
        self.history = []
        self.rewards = []


    # [docs]
    def get_reward(self, params):
        """
        Score a set of circuit parameters against the target distribution.

        The circuit is evaluated at ``params`` and its output distribution is
        compared with the target using a weighted sum of:

        - the Kullback–Leibler divergence ``KL(target || probs)``, weighted
          by ``alpha``;
        - the L2 distance between the two distributions, weighted by
          ``1 - alpha``.

        Note that, despite the method name, the value returned is this
        weighted cost with a positive sign: smaller values mean a better
        match to the target.

        Parameters
        ----------
        params : np.ndarray
            Vector of variational circuit parameters.

        Returns
        -------
        float
            ``alpha * KL + (1 - alpha) * L2`` for the circuit output at
            ``params``.
        """
        # Coerce to an array so a target supplied as a plain list does not
        # fail on the element-wise arithmetic below.
        target = np.array(self.target_distribution, dtype=float)
        probs = self.circuit(params)

        # Small offset keeps the logarithm finite when a probability is zero.
        eps = 1e-10

        # KL divergence (target || probs)
        kl_div = np.sum(target * np.log((target + eps) / (probs + eps)))

        # L2 error
        l2_error = np.linalg.norm(target - probs, ord=2)

        # Same value as the former ``-(-(...))`` double negation.
        return self.alpha * kl_div + (1 - self.alpha) * l2_error



    # [docs]
    def step(self, action):
        """
        Execute one optimization step.

        Adds ``action`` to the circuit parameters, evaluates the resulting
        probability distribution, scores it with ``get_reward()``, records
        both in the episode logs, and checks the stopping conditions.

        Parameters
        ----------
        action : np.ndarray
            Parameter update vector applied additively to the current
            circuit parameters.

        Returns
        -------
        observation : np.ndarray
            Probability distribution over computational basis states produced
            by the circuit after the parameter update.
        reward : float
            Value returned by ``get_reward()`` for the updated parameters.
        done : bool
            True if that value is below ``tolerance`` or the step counter has
            reached ``max_steps``.
        info : dict
            Empty dictionary provided for compatibility with Gymnasium-style APIs.
        """
        # Plain additive update: the parameters are neither clipped nor wrapped.
        self.params = self.params + action
        self.current_step += 1

        # NOTE: get_reward() evaluates the circuit again for the same parameters.
        probs = self.circuit(self.params)
        reward = self.get_reward(self.params)

        # The comparison can yield a NumPy/0-d array boolean; hand callers a
        # plain Python bool instead.
        done = bool(reward < self.tolerance or self.current_step >= self.max_steps)
        self.history.append(probs)
        self.rewards.append(reward)

        return probs, reward, done, {}





    # [docs]
    def reset(self, *, seed=None, options=None):
        """
        Reset the environment to a random initial parameter configuration.

        Draws new circuit parameters uniformly from ``[0, 2*pi)``, clears the
        recorded distributions and rewards, and resets the step counter.

        Parameters
        ----------
        seed : int or None, optional
            If given, seeds the module-level ``np.random`` generator before
            the parameters are drawn, making the reset reproducible.
        options : dict or None, optional
            Accepted so Gymnasium-style callers can pass it; not used.

        Returns
        -------
        observation : np.ndarray
            Initial circuit parameter vector of length ``n_params``. Note
            that this is the parameter vector, not a probability vector.
        info : dict
            Empty dictionary provided for compatibility with Gymnasium-style APIs.
        """
        if seed is not None:
            np.random.seed(seed)

        self.params = np.random.uniform(0, 2*np.pi, size=self.n_params)
        self.current_step = 0
        self.history = []
        self.rewards = []
        return self.params, {}



    # [docs]
    def render(self, save_path_without_extension=None):
        """
        Render the evolution of the probability distribution over training steps.

        The animation shows a bar plot comparing the target probability
        distribution with the distribution recorded at each step. The value
        recorded for that step is displayed in the plot title.

        Parameters
        ----------
        save_path_without_extension : str or None, optional
            Path (without file extension) to save the animation.
            If provided, the animation is saved using the configured writer
            and extension and the figure is closed afterwards. If None, the
            animation is displayed interactively.

        Returns
        -------
        None
            This method produces a visualization but does not return a value.

        Raises
        ------
        ValueError
            If no steps have been recorded yet, so there is nothing to draw.
        """
        # Fail early with a clear message, before any figure is created.
        if not self.history:
            raise ValueError("Nothing to render: call step() at least once before render().")

        first_probs = self.history[0]
        n_states = len(first_probs)

        fig, ax = plt.subplots(figsize=(10, 5))
        x = np.arange(2**self.n_qubits)
        width = 0.4

        # Target bars are static; only the prediction bars are animated.
        ax.bar(x - 0.2, self.target_distribution, width=width, label="Target")
        current_bar = ax.bar(x + 0.2, first_probs, width=width, label="Prediction")

        ax.set_ylim(0, 1)
        ax.set_xticks(range(n_states))
        ax.set_xticklabels([f"|{i}⟩" for i in range(n_states)])
        ax.set_xlabel("Basis states")
        ax.set_ylabel("Probability")
        ax.legend()

        def update(frame):
            probs = self.history[frame]
            for bar, new_height in zip(current_bar, probs):
                bar.set_height(new_height)
            # float() handles both 0-d arrays and plain Python floats.
            ax.set_title(f"Step {frame} | Reward: {float(self.rewards[frame]):.4f}")
            return current_bar

        ani = animation.FuncAnimation(fig, update, frames=len(self.history), blit=False)

        if save_path_without_extension:
            ani.save(f"{save_path_without_extension}.{self.render_extension}", writer=self.writer, fps=2)
            # Release the figure so repeated renders do not accumulate open figures.
            plt.close(fig)
        else:
            plt.show()