RLVE_Gym

Sleeping

File size: 3,967 Bytes

3bf8430

import random
from typing import Optional
from ...environment import VerifiableEnvironment


class CombinationOddSubsequenceCounting_Environment(VerifiableEnvironment) : # Source : https://www.luogu.com.cn/problem/P3773
    prompt_template = \
r"""You are given a sequence of **distinct** integers: {array}

Please count the number of subsequences (not necessarily contiguous, but the order must be preserved) a[1], ..., a[k] such that:
1. k ≥ 2 (the subsequence must have at least two elements);
2. C(a[1], a[2]) × C(a[2], a[3]) × ... × C(a[k−1], a[k]) is **odd**, where C(x, y) denotes the binomial coefficient "x choose y".

**Output Format:** A single integer — the number of such valid subsequences."""

    def __init__(self,
                 wrong_format : float = -1.0, rewarding_strategy : str = "(min/max)^beta", rewarding_weight : float = 1.0, rewarding_beta : float = 10.0,
                 **kwargs) :
        """
        Initialize the CombinationOddSubsequenceCounting_Environment instance.
        """
        super().__init__(**kwargs)

        self.rewards = {
            "wrong_format": wrong_format,
            "rewarding_strategy": rewarding_strategy,
            "rewarding_weight": rewarding_weight,
            "rewarding_beta": rewarding_beta,
        }
    

    def _generate(self) -> None :
        assert "N" in self.parameter, "N is required in parameter"
        N = self.parameter["N"]
        assert N >= 2, "N should be greater than or equal to 2"

        A = self.parameter["A"] = random.sample(range(1, 2 * N), N)
        random.shuffle(A)


        max_val = max(A)
        T = [-1] * (max_val + 1)
        for i, v in enumerate(A):
            T[v] = i

        # f[i] = number of non-increasing subsequences starting at i (including length-1)
        f = [0] * N
        ans = 0

        # DP from right to left
        for i in range(N - 1, -1, -1):
            mask = A[i]
            cnt = 1  # the subsequence [A[i]] itself
            # enumerate all non-zero proper submasks j of mask
            j = mask & (mask - 1)
            while j:
                idx = T[j]
                # if that value appears later in the sequence, extend subsequences
                if idx > i:
                    cnt += f[idx]
                j = mask & (j - 1)
            f[i] = cnt
            ans += cnt

        # subtract the length-1 subsequences to count only those of length >= 2
        ans -= N
        self.parameter["reference_answer"] = ans
    

    def _prompt_generate(self) -> str :
        return self.prompt_template.format(array = " ".join(map(str, self.parameter["A"])))
    

    def _process(self, answer : Optional[str]) -> Optional[int] :
        if answer is not None :
            answer = answer.strip()
            try :
                int_answer = int(answer)
                return int_answer
            except ValueError :
                return None
        else :
            return None
    

    def scorer(self, output : str) -> float :
        processed_result = self.processor(output)
        if processed_result is not None :
            if processed_result < 0 :
                return self.rewards["wrong_format"]
            
            if self.rewards["rewarding_strategy"] == "(min/max)^beta" :
                a, b = self.parameter["reference_answer"], processed_result
                if b == 0 :
                    return self.rewards["rewarding_weight"] * (a == 0)
                return self.rewards["rewarding_weight"] * (((min(a, b) / max(a, b))) ** self.rewards["rewarding_beta"])
            elif self.rewards["rewarding_strategy"] == "gold=answer" :
                return self.rewards["rewarding_weight"] * (processed_result == self.parameter["reference_answer"])
            else :
                raise NotImplementedError("Unknown rewarding strategy: {}".format(self.rewards["rewarding_strategy"]))
        else :
            return self.rewards["wrong_format"]