import math import random from typing import Optional, List from bisect import bisect_left, insort from ...environment import VerifiableEnvironment class BoxScheduling_Environment(VerifiableEnvironment) : # Source : https://www.luogu.com.cn/problem/P3207 prompt_template = \ r"""You are given a sequence C: {C} Now, please determine two non-negative integer sequences X[1], ..., X[{N_minus_1}] and Y[1], ..., Y[{N_minus_1}] such that: - For 1 ≤ i ≤ {N_minus_1}, define: Pos[i] = (C[i] + {D} × X[i] + Y[i]) mod {N} - The values Pos[1], ..., Pos[{N_minus_1}] must be all distinct. - No Pos[i] can be equal to {S}. - Among all valid solutions: + First, minimize the lexicographical order of sequence Y. + If multiple solutions have the same Y, then choose the one with the smallest lexicographical order of X. **Output Format:** A single line containing Pos[1], ..., Pos[{N_minus_1}], separated by spaces.""" def __init__(self, wrong_format : float = -1.0, invalid_solution : float = -0.5, rewarding_strategy : str = "mean([gold=answer])^beta", rewarding_beta : float = 10.0, rewarding_weight : float = +1.0, **kwargs) : """ Initialize the BoxScheduling_Environment instance. """ super().__init__(**kwargs) self.rewards = { "wrong_format" : wrong_format, "invalid_solution" : invalid_solution, "rewarding_strategy" : rewarding_strategy, "rewarding_beta" : rewarding_beta, "rewarding_weight" : rewarding_weight, } def _generate(self) -> None : assert "MAX_N" in self.parameter, "MAX_N is requid in parameter" MAX_N = self.parameter["MAX_N"] assert MAX_N >= 3, "MAX_N should be greater than or equal to 3" N = self.parameter["N"] = random.randint(3, MAX_N) C = self.parameter["C"] = [random.randint(0, N - 1) for _ in range(N - 1)] for iter in range(int(N ** 0.5)) : D = self.parameter["D"] = random.randint(1, N - 1) if math.gcd(D, N) > 1 : break S = self.parameter["S"] = random.randint(0, N - 1) c = [0] + C # 2) DSU for “next free” in a D‐cycle parent = list(range(N)) def find(x): while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x # 3) prepare the multiset st of (residue_mod_G, count) G = math.gcd(D, N) con = N // G tar = S % G # st will be a sorted list of (residue, remaining_slots) st = [] # we'll fill p[] as we go p = [0] * N # initialize for r in range(G): if r != tar: # all con slots available insort(st, (r, con)) else: # reserve one for the empty slot at i=0 p[0] = S # mark S as used by linking it to (S+D)%N parent[S] = find((S + D) % N) # if there are more in this class, keep (con-1) if con > 1: insort(st, (r, con - 1)) # 4) assign positions for boxes 1..N-1 for i in range(1, N): key = c[i] % G # find the first entry in st with residue >= key idx = bisect_left(st, (key, -1)) if idx == len(st): # wrap around to the smallest residue idx = 0 r, cnt = st.pop(idx) # if more remain in this residue‐class, put it back if cnt > 1: insort(st, (r, cnt - 1)) # compute the base position before DSU‐skipping if r >= key: j = (c[i] + (r - key)) % N else: # jump up one multiple of G j = (((c[i] // G) + 1) * G + r) % N # find the actual next free slot in its D‐cycle pj = find(j) p[i] = pj # mark pj used parent[pj] = find((pj + D) % N) self.parameter["gold_answer"] = p[1 :] self.parameter["reference_answer"] = " ".join(map(str, self.parameter["gold_answer"])) def _prompt_generate(self) -> str : N = self.parameter["N"] return self.prompt_template.format( N = N, N_minus_1 = N - 1, C = " ".join("C[{}]={}".format(i + 1, Ci) for i, Ci in enumerate(self.parameter["C"])), D = self.parameter["D"], S = self.parameter["S"], ) def _process(self, answer : Optional[str]) -> Optional[List] : if answer is not None : answer = answer.strip() try : answer_array = list(map(int, answer.split())) return answer_array except ValueError : return None # Invalid answer format else : return None # Invalid answer format def scorer(self, output : str) -> float : processed_result = self.processor(output) if processed_result is not None : assert isinstance(processed_result, list), "processed_result should be a list" Pos = processed_result if len(Pos) != self.parameter["N"] - 1 : return self.rewards["invalid_solution"] if set(Pos) != set(range(self.parameter["N"])) - {self.parameter["S"]} : return self.rewards["invalid_solution"] if self.rewards["rewarding_strategy"] == "mean([gold=answer])^beta" : return self.rewards["rewarding_weight"] * ((sum(int(a == b) for a, b in zip(self.parameter["gold_answer"], Pos)) / (self.parameter["N"] - 1)) ** self.rewards["rewarding_beta"]) elif self.rewards["rewarding_strategy"] == "gold=answer" : return self.rewards["rewarding_weight"] * (self.parameter["gold_answer"] == Pos) else : raise NotImplementedError("Unknown rewarding strategy: {}".format(self.rewards["rewarding_strategy"])) else : return self.rewards["wrong_format"]