RLVE_Gym

Sleeping

App Files Files Community

RLVE_Gym / server /Gym /environments /box_scheduling /environment.py

ZhiyuanZeng

Upload folder using huggingface_hub

3bf8430 verified about 1 month ago

raw

history blame contribute delete

6.18 kB

	import math
	import random
	from typing import Optional, List
	from bisect import bisect_left, insort
	from ...environment import VerifiableEnvironment


	class BoxScheduling_Environment(VerifiableEnvironment) : # Source : https://www.luogu.com.cn/problem/P3207
	prompt_template = \
	r"""You are given a sequence C: {C}

	Now, please determine two non-negative integer sequences X[1], ..., X[{N_minus_1}] and Y[1], ..., Y[{N_minus_1}] such that:
	- For 1 ≤ i ≤ {N_minus_1}, define: Pos[i] = (C[i] + {D} × X[i] + Y[i]) mod {N}
	- The values Pos[1], ..., Pos[{N_minus_1}] must be all distinct.
	- No Pos[i] can be equal to {S}.
	- Among all valid solutions:
	+ First, minimize the lexicographical order of sequence Y.
	+ If multiple solutions have the same Y, then choose the one with the smallest lexicographical order of X.

	Output Format: A single line containing Pos[1], ..., Pos[{N_minus_1}], separated by spaces."""

	def __init__(self,
	wrong_format : float = -1.0, invalid_solution : float = -0.5, rewarding_strategy : str = "mean([gold=answer])^beta", rewarding_beta : float = 10.0, rewarding_weight : float = +1.0,
	**kwargs) :
	"""
	Initialize the BoxScheduling_Environment instance.
	"""
	super().__init__(**kwargs)

	self.rewards = {
	"wrong_format" : wrong_format,
	"invalid_solution" : invalid_solution,
	"rewarding_strategy" : rewarding_strategy,
	"rewarding_beta" : rewarding_beta,
	"rewarding_weight" : rewarding_weight,
	}


	def _generate(self) -> None :
	assert "MAX_N" in self.parameter, "MAX_N is requid in parameter"
	MAX_N = self.parameter["MAX_N"]
	assert MAX_N >= 3, "MAX_N should be greater than or equal to 3"

	N = self.parameter["N"] = random.randint(3, MAX_N)
	C = self.parameter["C"] = [random.randint(0, N - 1) for _ in range(N - 1)]
	for iter in range(int(N ** 0.5)) :
	D = self.parameter["D"] = random.randint(1, N - 1)
	if math.gcd(D, N) > 1 :
	break
	S = self.parameter["S"] = random.randint(0, N - 1)


	c = [0] + C

	# 2) DSU for “next free” in a D‐cycle
	parent = list(range(N))
	def find(x):
	while parent[x] != x:
	parent[x] = parent[parent[x]]
	x = parent[x]
	return x

	# 3) prepare the multiset st of (residue_mod_G, count)
	G = math.gcd(D, N)
	con = N // G
	tar = S % G

	# st will be a sorted list of (residue, remaining_slots)
	st = []
	# we'll fill p[] as we go
	p = [0] * N

	# initialize
	for r in range(G):
	if r != tar:
	# all con slots available
	insort(st, (r, con))
	else:
	# reserve one for the empty slot at i=0
	p[0] = S
	# mark S as used by linking it to (S+D)%N
	parent[S] = find((S + D) % N)
	# if there are more in this class, keep (con-1)
	if con > 1:
	insort(st, (r, con - 1))

	# 4) assign positions for boxes 1..N-1
	for i in range(1, N):
	key = c[i] % G

	# find the first entry in st with residue >= key
	idx = bisect_left(st, (key, -1))
	if idx == len(st):
	# wrap around to the smallest residue
	idx = 0

	r, cnt = st.pop(idx)
	# if more remain in this residue‐class, put it back
	if cnt > 1:
	insort(st, (r, cnt - 1))

	# compute the base position before DSU‐skipping
	if r >= key:
	j = (c[i] + (r - key)) % N
	else:
	# jump up one multiple of G
	j = (((c[i] // G) + 1) * G + r) % N

	# find the actual next free slot in its D‐cycle
	pj = find(j)
	p[i] = pj
	# mark pj used
	parent[pj] = find((pj + D) % N)

	self.parameter["gold_answer"] = p[1 :]
	self.parameter["reference_answer"] = " ".join(map(str, self.parameter["gold_answer"]))


	def _prompt_generate(self) -> str :
	N = self.parameter["N"]
	return self.prompt_template.format(
	N = N,
	N_minus_1 = N - 1,
	C = " ".join("C[{}]={}".format(i + 1, Ci) for i, Ci in enumerate(self.parameter["C"])),
	D = self.parameter["D"],
	S = self.parameter["S"],
	)


	def _process(self, answer : Optional[str]) -> Optional[List] :
	if answer is not None :
	answer = answer.strip()
	try :
	answer_array = list(map(int, answer.split()))
	return answer_array
	except ValueError :
	return None # Invalid answer format
	else :
	return None # Invalid answer format


	def scorer(self, output : str) -> float :
	processed_result = self.processor(output)
	if processed_result is not None :
	assert isinstance(processed_result, list), "processed_result should be a list"

	Pos = processed_result
	if len(Pos) != self.parameter["N"] - 1 :
	return self.rewards["invalid_solution"]
	if set(Pos) != set(range(self.parameter["N"])) - {self.parameter["S"]} :
	return self.rewards["invalid_solution"]

	if self.rewards["rewarding_strategy"] == "mean([gold=answer])^beta" :
	return self.rewards["rewarding_weight"] * ((sum(int(a == b) for a, b in zip(self.parameter["gold_answer"], Pos)) / (self.parameter["N"] - 1)) ** self.rewards["rewarding_beta"])
	elif self.rewards["rewarding_strategy"] == "gold=answer" :
	return self.rewards["rewarding_weight"] * (self.parameter["gold_answer"] == Pos)
	else :
	raise NotImplementedError("Unknown rewarding strategy: {}".format(self.rewards["rewarding_strategy"]))
	else :
	return self.rewards["wrong_format"]