# Source code for qrl.env.core.probability

'''
Implementation of ProbabilityV0 environment

Author: Jay Shah (@Jayshah25)

Contact: jay.shah@qrlqai.com

License: Apache-2.0
'''
from gymnasium import spaces
import pennylane as qml
from pennylane import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import shutil
from ._base import QuantumEnv

class ProbabilityV0(QuantumEnv):
    """
    Probability distribution matching environment for variational quantum circuits.

    ``ProbabilityV0`` trains a parameterized quantum circuit to approximate a
    target probability distribution over computational basis states. The agent
    proposes additive updates to the circuit parameters so that the measurement
    probabilities of the circuit approach the target distribution.

    Key properties
    --------------
    - **Action space**: ``Box(-0.1, 0.1, shape=(n_params,))``; an action is
      added to the current circuit parameters.
    - **Observation space**: ``Box(0, 1, shape=(2**n_qubits,))``; ``step()``
      returns the circuit's probability distribution over basis states.
    - **Reward**: the value returned by ``get_reward``, i.e. the weighted cost
      ``alpha * KL(target || probs) + (1 - alpha) * ||target - probs||_2``.
      Lower values mean a closer match.
    - **Termination**: ``step()`` reports ``done`` when that value drops below
      ``tolerance`` or when ``max_steps`` steps have been taken.

    NOTE(review): the value is a cost (lower is better), not a negated cost,
    and ``beta`` is stored but not applied anywhere in this class. Confirm the
    intended sign convention and step penalty before relying on either.

    Visualization
    -------------
    ``render()`` animates the recorded probability distributions against the
    target distribution, showing the recorded reward in the plot title.

    Parameters
    ----------
    n_qubits : int
        Number of qubits in the circuit.
    target_distribution : np.ndarray
        Target probability distribution over the ``2**n_qubits`` computational
        basis states. Must sum to 1.
    ansatz : callable or None
        Custom circuit ansatz called as ``ansatz(params, wires=...)``. It must
        expose an ``n_params`` attribute. If ``None``, one RY rotation per
        qubit is used.
    max_steps : int, optional
        Maximum number of steps per episode (keyword argument, default 100).
    tolerance : float, optional
        Threshold compared against the step reward (keyword argument,
        default -1e3). NOTE(review): with ``alpha`` in [0, 1] this default
        looks unreachable, so episodes end at ``max_steps``; confirm.
    alpha : float, optional
        Weight of the KL term; ``1 - alpha`` weights the L2 term (keyword
        argument, default 0.5).
    beta : float, optional
        Step penalty weight (keyword argument, default 0.01). Stored only.
    ffmpeg : bool, optional
        Save animations as MP4 through FFmpeg instead of GIF through Pillow
        (keyword argument, default False).

    Raises
    ------
    AssertionError
        If ``target_distribution`` does not sum to 1.
    ValueError
        If ``target_distribution`` does not have shape ``(2**n_qubits,)``, if
        ``ffmpeg`` is requested but not installed, or if a custom ``ansatz``
        has no ``n_params`` attribute.

    See Also
    --------
    :doc:`tutorials/probability`
        Tutorial on probability distribution learning with variational circuits.
    """

    def __init__(self, n_qubits: int, target_distribution: np.ndarray, ansatz=None, **kwargs):
        super().__init__()

        assert np.isclose(np.sum(target_distribution), 1.0), \
            "Target distribution must sum to 1."
        # A wrong-length target would otherwise broadcast silently (length 1)
        # or fail much later inside the cost computation or the renderer.
        expected_shape = (2 ** n_qubits,)
        if np.shape(target_distribution) != expected_shape:
            raise ValueError(
                f"Target distribution must have shape {expected_shape}, "
                f"got {np.shape(target_distribution)}."
            )

        self.n_qubits = n_qubits
        self.target_distribution = target_distribution
        self.max_steps = kwargs.get("max_steps", 100)
        self.tolerance = kwargs.get("tolerance", -1e3)
        self.alpha = kwargs.get("alpha", 0.5)  # weight for KL vs L2
        self.beta = kwargs.get("beta", 0.01)  # step penalty weight (currently unused)

        ffmpeg = kwargs.get("ffmpeg", False)
        self.render_extension = "mp4" if ffmpeg else "gif"
        self.writer = "ffmpeg" if ffmpeg else "pillow"
        # Use the same truthiness test as the writer selection above so any
        # value that selects the ffmpeg writer is also checked for availability.
        if ffmpeg and shutil.which("ffmpeg") is None:
            raise ValueError("ffmpeg not found on system. Please install ffmpeg or set ffmpeg=False")

        # Define PennyLane device
        self.dev = qml.device("default.qubit", wires=self.n_qubits)

        # If no ansatz is provided, define a simple one: one RY rotation per wire.
        if ansatz is None:
            def default_ansatz(params, wires):
                for i, w in enumerate(wires):
                    qml.RY(params[i], wires=w)

            self.ansatz = default_ansatz
            self.n_params = self.n_qubits
        else:
            self.ansatz = ansatz
            try:
                self.n_params = ansatz.n_params
            except AttributeError as err:
                raise ValueError("Please specify ansatz with n_params attribute.") from err

        # QNode returning the probabilities of all computational basis states.
        @qml.qnode(self.dev)
        def circuit(params):
            self.ansatz(params, wires=range(self.n_qubits))
            return qml.probs(wires=range(self.n_qubits))

        self.circuit = circuit

        # Spaces
        self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(self.n_params,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=1, shape=(2**self.n_qubits,), dtype=np.float32)

        # Internal state
        self.params = np.random.uniform(0, 2*np.pi, size=self.n_params)
        self.current_step = 0
        self.history = []
        self.rewards = []

    def _weighted_cost(self, probs):
        """Return ``alpha * KL(target || probs) + (1 - alpha) * L2(target - probs)``."""
        # The 1e-10 offsets keep the logarithm finite when an entry is zero.
        kl_div = np.sum(
            self.target_distribution
            * np.log((self.target_distribution + 1e-10) / (probs + 1e-10))
        )
        l2_error = np.linalg.norm(self.target_distribution - probs, ord=2)
        return self.alpha * kl_div + (1 - self.alpha) * l2_error

    def get_reward(self, params):
        """
        Compute the reward value for a given set of circuit parameters.

        The circuit is evaluated at ``params`` and its output distribution is
        scored with a weighted combination of:

        - Kullback-Leibler (KL) divergence between the target distribution and
          the circuit output distribution.
        - L2 distance between the target and circuit distributions.

        NOTE(review): the returned value is the weighted cost itself (lower is
        better); ``step()`` compares it against ``tolerance`` with ``<``.
        Confirm the intended sign convention before changing either side.

        Parameters
        ----------
        params : np.ndarray
            Vector of variational circuit parameters.

        Returns
        -------
        float
            ``alpha * KL + (1 - alpha) * L2`` for the circuit output at
            ``params``.
        """
        return self._weighted_cost(self.circuit(params))

    def step(self, action):
        """
        Execute one optimization step.

        Adds ``action`` to the current circuit parameters, evaluates the
        resulting probability distribution, computes the reward, records both
        in ``history`` / ``rewards``, and checks the termination conditions.

        Parameters
        ----------
        action : np.ndarray
            Parameter update vector applied additively to the current circuit
            parameters.

        Returns
        -------
        observation : np.ndarray
            Probability distribution over computational basis states produced
            by the circuit after the parameter update.
        reward : float
            Value of ``get_reward`` at the updated parameters.
        done : bool
            True if the reward is below ``tolerance`` or the maximum number of
            steps has been reached.
        info : dict
            Empty dictionary.
        """
        # Parameters are accumulated as-is; they are not wrapped or clipped.
        self.params = self.params + action
        self.current_step += 1

        # Evaluate the circuit once and score that same distribution instead
        # of simulating the circuit a second time inside get_reward().
        probs = self.circuit(self.params)
        reward = self._weighted_cost(probs)
        done = bool(reward < self.tolerance or self.current_step >= self.max_steps)

        self.history.append(probs)
        self.rewards.append(reward)
        return probs, reward, done, {}

    def reset(self):
        """
        Reset the environment to a random initial parameter configuration.

        Draws new circuit parameters uniformly from ``[0, 2*pi)``, clears the
        recorded history and rewards, and resets the step counter.

        NOTE(review): the first return value is the parameter vector (shape
        ``(n_params,)``), whereas ``step()`` returns a probability distribution
        matching ``observation_space``. Confirm whether callers rely on this.

        Returns
        -------
        observation : np.ndarray
            Initial circuit parameter vector.
        info : dict
            Empty dictionary.
        """
        self.params = np.random.uniform(0, 2*np.pi, size=self.n_params)
        self.current_step = 0
        self.history = []
        self.rewards = []
        return self.params, {}

    def render(self, save_path_without_extension=None):
        """
        Render the evolution of the probability distribution over recorded steps.

        The animation shows a bar plot comparing the target probability
        distribution with the circuit's distribution at each recorded step.
        The recorded reward is displayed in the plot title.

        Parameters
        ----------
        save_path_without_extension : str or None, optional
            Path (without file extension) to save the animation. If provided,
            the animation is saved using the configured writer (MP4 for FFmpeg
            or GIF for Pillow). If None, the animation is displayed
            interactively.

        Returns
        -------
        None

        Raises
        ------
        IndexError
            If no step has been recorded since the last reset.
        """
        # Fail before allocating a figure when there is nothing to draw.
        if not self.history:
            raise IndexError("No steps recorded; call step() before render().")

        fig, ax = plt.subplots(figsize=(10, 5))
        x = np.arange(2**self.n_qubits)
        width = 0.4
        n_states = len(self.history[0])

        ax.bar(x - 0.2, self.target_distribution, width=width, label="Target")
        current_bar = ax.bar(x + 0.2, self.history[0], width=width, label="Prediction")

        ax.set_ylim(0, 1)
        ax.set_xticks(range(n_states))
        ax.set_xticklabels([f"|{i}⟩" for i in range(n_states)])
        ax.set_xlabel("Basis states")
        ax.set_ylabel("Probability")
        ax.legend()

        def update(frame):
            probs = self.history[frame]
            for bar, new_height in zip(current_bar, probs):
                bar.set_height(new_height)
            # float() handles NumPy scalars, 0-d arrays and plain Python floats.
            ax.set_title(f"Step {frame} | Reward: {float(self.rewards[frame]):.4f}")
            return current_bar

        # Keep a reference so the animation is not garbage-collected early.
        ani = animation.FuncAnimation(fig, update, frames=len(self.history), blit=False)

        if save_path_without_extension:
            ani.save(f"{save_path_without_extension}.{self.render_extension}", writer=self.writer, fps=2)
            # Release the figure once the file is written.
            plt.close(fig)
        else:
            plt.show()