import torch
import nltk
from nltk.translate.bleu_score import SmoothingFunction
from tqdm import tqdm


def calculate_perplexity(model, tokens, prompt_len, bsz=1, marker=False):
    """
    Calculate perplexity of given tokens using the provided model, ignoring padding tokens.

    Args:
        model: Llama model
        tokens (List[List[int]] or torch.Tensor): Input tokens (n_prompts * n_drafts, seqlen)
        prompt_len (int): Prefix length
        bsz (int): Batch size
        marker (bool): Whether to show a progress bar

    Returns:
        Perplexity of each generation, shape (n_prompts * n_drafts,)
    """
    it = range(0, len(tokens), bsz)
    if marker:
        it = tqdm(it)
    ppl = torch.zeros(len(tokens))
    for start in it:
        end = start + bsz
        data = tokens[start:end]
        if not isinstance(data, list):
            data = data.tolist()
        # Remove any padding tokens (-1) in the generations
        for d_idx in range(len(data)):
            cur = data[d_idx]
            if -1 in cur:
                data[d_idx] = cur[:cur.index(-1)]
        # Calculate cross-entropy loss on the tokens
        ce_loss = model.generate(data, max_gen_len=0, temperature=-1, top_p=-1, grade=True)
        # Keep only the losses past `prompt_len`, dropping the prompt itself
        ce_loss = ce_loss[:, prompt_len - 1:]  # Subtract 1 because the first (start) token is removed
        # Calculate perplexity over the non-padding positions
        lengths = (ce_loss != 0).sum(dim=-1)
        mean = ce_loss.sum(dim=-1) / lengths
        ppl[start:end] = torch.exp(-1 * mean)
    return ppl
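

# Hedged usage sketch (not part of the original module): `calculate_perplexity`
# expects a model whose `generate(..., grade=True)` call returns a
# (batch, seqlen - 1) tensor of per-token scores. `_StubScorer` below is a
# hypothetical stand-in for that interface, used only to illustrate the
# expected shapes and the -1 padding convention.
if __name__ == "__main__":
    class _StubScorer:
        def generate(self, data, max_gen_len, temperature, top_p, grade):
            # Pretend every token after the first receives the same constant score.
            seqlen = max(len(d) for d in data)
            return torch.full((len(data), seqlen - 1), -2.0)

    toy_tokens = [[1, 5, 6, 7, 8, -1, -1], [1, 2, 3, 4, 5, 6, 7]]
    print("perplexity:", calculate_perplexity(_StubScorer(), toy_tokens, prompt_len=2, bsz=2))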


def calculate_diversity(generations, k=4):
    """
    Calculate diversity of generations using SELF-BLEU.

    Args:
        generations (List[List[List[int]]]): Tokenized input
        k (int, Optional): Maximum n-gram order to use for BLEU

    Returns:
        Average SELF-BLEU across all generations (float)
    """
    nltk.download('punkt')  # Can be deleted once downloaded
    smooth = SmoothingFunction()
    bleus = []
    for drafts in generations:
        tokenized_drafts = []
        # Stringify tokens, dropping padding (-1)
        for d in drafts:
            if -1 in d:
                d = d[:d.index(-1)]
            tokenized_drafts.append([str(n) for n in d])
        # Calculate SELF-BLEU with uniform weights over n-gram orders up to min(shortest draft, k)
        minlength = min(len(g) for g in tokenized_drafts)
        minlength = min(minlength, k)
        weights = tuple(1.0 / minlength for _ in range(minlength))
        for i in range(len(drafts)):
            # Create source and references (all other drafts)
            src = tokenized_drafts[i]
            ref = tokenized_drafts[:i] + tokenized_drafts[i + 1:]
            tmp = nltk.translate.bleu_score.sentence_bleu(references=ref,
                                                          hypothesis=src,
                                                          weights=weights,
                                                          smoothing_function=smooth.method1)
            bleus.append(tmp)
    bleus = torch.Tensor(bleus)
    return torch.mean(bleus).item()
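

# Hedged example (toy token IDs, not real model output): when the drafts for a
# prompt are identical, SELF-BLEU is ~1 (low diversity); varied drafts score
# lower. Requires `nltk` (the function downloads 'punkt' on first call).
if __name__ == "__main__":
    toy_generations = [
        [[5, 6, 7, 8, -1], [5, 6, 7, 8, -1], [5, 6, 7, 8, -1]],  # identical drafts
        [[1, 2, 3, 4, 5], [9, 8, 7, 6, 5], [1, 9, 2, 8, 3]],     # varied drafts
    ]
    print("SELF-BLEU:", calculate_diversity(toy_generations))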


def calculate_ngram_repetition(sequences):
    """
    Calculate uniqueness scores of `sequences`.

    Args:
        sequences (List[List[int]]): Generated sequences

    Returns:
        (unigram_uniqueness, bigram_uniqueness, trigram_uniqueness)
    """
    u_total = 0
    b_total = 0
    t_total = 0
    # Iterate through all sequences indiscriminately
    for gen in sequences:
        if -1 in gen:
            gen = gen[:gen.index(-1)]
        unigrams, bigrams, trigrams = [], [], []
        o = [str(i) for i in gen]
        # Create lists of n-grams for the generation
        for i in range(len(o)):
            unigrams.append(o[i])
        for i in range(len(o) - 1):
            bigrams.append(o[i] + '_' + o[i + 1])
        for i in range(len(o) - 2):
            trigrams.append(o[i] + '_' + o[i + 1] + '_' + o[i + 2])
        # Calculate uniqueness of the generation
        u = len(set(unigrams)) / len(unigrams)
        b = len(set(bigrams)) / len(bigrams)
        t = len(set(trigrams)) / len(trigrams)
        u_total += u
        b_total += b
        t_total += t
    return u_total / len(sequences), b_total / len(sequences), t_total / len(sequences)
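

# Hedged example (toy sequences, not real generations): the repetitive sequence
# drags down the average uniqueness scores relative to the distinct one, and
# padding (-1) is stripped before counting n-grams.
if __name__ == "__main__":
    toy_sequences = [
        [1, 2, 3, 1, 2, 3, 1, 2, 3],     # heavily repeated n-grams
        [4, 5, 6, 7, 8, 9, 10, -1, -1],  # all n-grams unique (padded)
    ]
    print("(unigram, bigram, trigram) uniqueness:",
          calculate_ngram_repetition(toy_sequences))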