Elron committed on
Commit f86db44
1 Parent(s): cddf773

Upload metrics.py with huggingface_hub

Files changed (1)
  1. metrics.py +194 -5
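
The commit message suggests the file was pushed programmatically. For reference, a minimal sketch of such an upload via huggingface_hub's HfApi.upload_file (the repo_id here is a placeholder, not taken from this page):

from huggingface_hub import HfApi

# Hypothetical reconstruction of the upload; repo_id is a placeholder.
api = HfApi()
api.upload_file(
    path_or_fileobj="metrics.py",    # local file to push
    path_in_repo="metrics.py",       # destination path inside the repo
    repo_id="<user-or-org>/<repo>",  # placeholder: target repo on the Hub
    commit_message="Upload metrics.py with huggingface_hub",
)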
metrics.py CHANGED
@@ -1,7 +1,7 @@
 import re
 import string
 import uuid
-from abc import abstractmethod
+from abc import ABC, abstractmethod
 from collections import Counter
 from dataclasses import field
 from typing import Any, Dict, Generator, List, Optional, Tuple
@@ -361,7 +361,7 @@ class BulkInstanceMetric(SingleStreamOperator, MetricWithConfidenceInterval):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> Dict[str, Any]:
+    ) -> List[Dict[str, Any]]:
         pass
 
 
@@ -643,7 +643,6 @@ class HuggingfaceBulkMetric(BulkInstanceMetric):
         predictions: List[str],
         additional_inputs: List[Any],
     ) -> List[Dict[str, Any]]:
-        passed_additional_inputs = {}
         passed_additional_inputs = {}
         for additional_input_field in self.hf_additional_input_fields:
             assert (
@@ -1247,7 +1246,7 @@ class SentenceBert(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         scores = []
 
         # we are in a multi-reference case (each prediction may have multiple
@@ -1292,7 +1291,7 @@ class Reward(BulkInstanceMetric):
         references: List[List[Any]],
         predictions: List[Any],
         additional_inputs: List[Dict],
-    ) -> List[Any]:
+    ) -> List[Dict[str, Any]]:
         # treat the references as the questions and the predictions as answers
         # assume a single reference
         questions = [refs[0] for refs in references]
@@ -1306,6 +1305,196 @@ class Reward(BulkInstanceMetric):
         return self.pipe(inputs, batch_size=self.batch_size)
 
 
+class Perplexity(BulkInstanceMetric):
+    """Computes the likelihood of generating text Y after text X - P(Y|X)."""
+
+    main_score = "perplexity"
+    reduction_map = {"mean": ["perplexity"]}
+
+    perplexity_prompt: str
+
+    batch_size: int = 32
+    model_name: str
+
+    def compute(
+        self,
+        references: List[List[Any]],
+        predictions: List[Any],
+        additional_inputs: List[Dict],
+    ) -> List[Dict[str, Any]]:
+        """Computes the likelihood of generating text Y after text X - P(Y|X).
+
+        :param references: the list of Y texts as a list of singletons.
+        :param predictions: the list of X texts as a plain list of strings.
+
+        :return: the likelihood of generating text Y_i after text X_i = P(Y_i|X_i) for every i.
+        """
+        # make sure all references are singletons
+        assert all(len(ref) == 1 for ref in references)
+
+        # add the instruction as prefix
+        predictions = [f"{self.perplexity_prompt} {x}" for x in predictions]
+        references = [y[0] for y in references]
+
+        # check if the model is enc-dec or dec-only to use the right perplexity computation
+        from transformers import AutoConfig
+
+        config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=True)
+        lm = (
+            self.EncoderDecoderLM(model_name=self.model_name)
+            if config.is_encoder_decoder is True
+            else self.DecoderOnlyLM(model_name=self.model_name)
+        )
+
+        # compute P(Y_i|X_i) for every instance
+        scores = lm.compute_lm(
+            source=predictions, target=references, batch_size=self.batch_size
+        )
+
+        return [{self.main_score: score} for score in scores]
+
+    class AbstractLM(ABC):
+        def __init__(self, model_name):
+            import torch
+            from transformers import AutoTokenizer
+
+            self.model_name = model_name
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            self.model = self.model_class().from_pretrained(self.model_name)
+            self.is_cuda = torch.cuda.is_available()
+
+        def compute_lm(self, source, target, batch_size: int) -> List[float]:
+            import torch
+
+            scores = []
+
+            with torch.no_grad():
+                # break the documents into batches
+                n_batches = int(len(source) / batch_size)
+                batch_range = range(n_batches + 1)
+                for batch in batch_range:
+                    batch_source = source[batch * batch_size : (batch + 1) * batch_size]
+                    batch_target = target[batch * batch_size : (batch + 1) * batch_size]
+                    if len(batch_source) > 0:
+                        # tokenize the source and target
+                        tokens_source = self.tokenizer(
+                            batch_source, padding=True, return_tensors="pt"
+                        )
+                        tokens_target = self.tokenizer(
+                            batch_target, padding=True, return_tensors="pt"
+                        )
+
+                        # compute the logits
+                        logits, labels = self.compute_batch(
+                            tokens_source, tokens_target
+                        )
+
+                        # the model returns the mean loss over the whole batch. We run the CE
+                        # again without reduction and extract the mean for each document
+                        loss_fct = torch.nn.CrossEntropyLoss(
+                            ignore_index=-100, reduction="none"
+                        )
+                        loss = loss_fct(
+                            logits.view(-1, logits.size(-1)), labels.view(-1)
+                        )
+                        loss = loss.view(len(batch_source), -1)
+
+                        # for each document, take the mean only over the real target tokens (sum(labels > 0))
+                        batch_loss = torch.sum(loss, dim=1) / torch.sum(
+                            labels > 0, dim=1
+                        )
+
+                        # append the batch scores to the list of all scores
+                        scores.append(batch_loss)
+
+            return torch.cat(scores, dim=0).tolist()
+
+        @abstractmethod
+        def model_class(self):
+            pass
+
+        @abstractmethod
+        def compute_batch(self, tokens_source, tokens_target):
+            pass
+
+    class EncoderDecoderLM(AbstractLM):
+        def model_class(self):
+            from transformers import AutoModelForSeq2SeqLM
+
+            return AutoModelForSeq2SeqLM
+
+        def compute_batch(self, tokens_source, tokens_target):
+            tokens_docs_ids = tokens_source["input_ids"]
+            attention = tokens_source["attention_mask"]
+            labels = tokens_target["input_ids"]
+
+            if self.is_cuda:
+                tokens_docs_ids, attention, labels = (
+                    tokens_docs_ids.cuda(),
+                    attention.cuda(),
+                    labels.cuda(),
+                )
+
+            logits = self.model(
+                input_ids=tokens_docs_ids.long(),
+                attention_mask=attention.long(),
+                labels=labels.long(),
+            ).logits
+
+            # replace the padding token in the labels by -100
+            labels[labels == self.tokenizer.pad_token_id] = -100
+
+            return logits, labels
+
+    class DecoderOnlyLM(AbstractLM):
+        def model_class(self):
+            from transformers import AutoModelForCausalLM
+
+            return AutoModelForCausalLM
+
+        def compute_batch(self, tokens_source, tokens_target):
+            import torch
+
+            tokens = torch.cat(
+                [tokens_source["input_ids"], tokens_target["input_ids"]], dim=1
+            )
+            attention = torch.cat(
+                [tokens_source["attention_mask"], tokens_target["attention_mask"]],
+                dim=1,
+            )
+            labels = torch.cat(
+                [
+                    torch.zeros_like(tokens_source["input_ids"]).fill_(-100),
+                    tokens_target["input_ids"],
+                ],
+                dim=1,
+            )
+
+            # replace the padding token in the labels by -100
+            labels[labels == self.tokenizer.pad_token_id] = -100
+
+            if self.is_cuda:
+                tokens, attention, labels = (
+                    tokens.cuda(),
+                    attention.cuda(),
+                    labels.cuda(),
+                )
+
+            # no need to pass labels as we calculate the loss below per document
+            model_output = self.model(
+                input_ids=tokens.long(), attention_mask=attention.long()
+            )
+            logits = model_output.logits
+
+            # in decoder-only models the first token is not generated, it is taken from the
+            # input, so the model generates tokens 2..n+1. Therefore we need to skip the last
+            # logit and the first label.
+            shifted_logits = logits[..., :-1, :].contiguous()
+            shifted_labels = labels[..., 1:].contiguous()
+
+            return shifted_logits, shifted_labels
+
+
 class NDCG(GlobalMetric):
     """Normalized Discounted Cumulative Gain: measures the quality of ranking with respect to ground truth ranking scores.
 
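
As a usage note for the new metric, a minimal sketch of calling Perplexity directly (the model name, prompt, and texts are illustrative placeholders; direct instantiation is assumed to work as for the other bulk metrics in this file):

# Hypothetical usage; model_name, prompt, and texts are placeholders.
metric = Perplexity(
    model_name="google/flan-t5-small",  # any HF seq2seq or causal LM
    perplexity_prompt="Generate a passage that answers the question:",
)
instance_scores = metric.compute(
    predictions=["what is the capital of France?"],    # the X texts
    references=[["Paris is the capital of France."]],  # singleton lists of Y texts
    additional_inputs=[{}],
)
# each element is {"perplexity": <mean token-level cross-entropy of Y given X>}

Note that, as implemented, the reported "perplexity" score is the mean token-level cross-entropy loss, not its exponent.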
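The per-document loss trick in compute_lm (cross-entropy with reduction="none", then a masked mean per row) can be illustrated in isolation with toy tensors; the committed code divides by torch.sum(labels > 0, dim=1), which counts the same positions as the mask below whenever real token ids are positive:

import torch

# Toy illustration of the masked per-document mean used in compute_lm.
logits = torch.randn(2, 4, 10)             # (batch, seq_len, vocab)
labels = torch.tensor([[5, 2, -100, -100],
                       [1, 3, 4, -100]])   # -100 marks positions to ignore

loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
loss = loss.view(labels.size(0), -1)       # back to (batch, seq_len); ignored positions are 0

per_doc = loss.sum(dim=1) / (labels != -100).sum(dim=1)  # mean over real target tokens only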