Instructions to use softcatala/paraphrase-ca with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use softcatala/paraphrase-ca with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("softcatala/paraphrase-ca", dtype="auto") - Notebooks
- Google Colab
- Kaggle
| from transformers import T5Tokenizer, MT5ForConditionalGeneration | |
| from simpletransformers.t5 import T5Model | |
| import datetime | |
| import logging | |
| import os | |
| class Inference: | |
| def _discard_recommendations(self, original, proposal): | |
| proposal = proposal.lower() | |
| original = original.lower() | |
| if proposal == original: | |
| return True | |
| chars = [".", "!", " ", "?", ","] | |
| _proposal = proposal | |
| _original = original | |
| for char in chars: | |
| proposal = proposal.replace(char, "") | |
| original = original.replace(char, "") | |
| if proposal == original: | |
| return True | |
| return False | |
| # https://github.com/Vamsi995/Paraphrase-Generator/blob/master/evaluate.py | |
| def get_paraphrases( | |
| self, | |
| model_name, | |
| sentence, | |
| temperature, | |
| prefix="paraphrase: ", | |
| n_predictions=2, | |
| top_k=120, | |
| max_length=256, | |
| device="cpu", | |
| ): | |
| model = MT5ForConditionalGeneration.from_pretrained(model_name) | |
| tokenizer = T5Tokenizer.from_pretrained(model_name) | |
| discaded = 0 | |
| text = prefix + sentence + " </s>" | |
| encoding = tokenizer.encode_plus( | |
| text, pad_to_max_length=True, return_tensors="pt" | |
| ) | |
| input_ids, attention_masks = encoding["input_ids"].to(device), encoding[ | |
| "attention_mask" | |
| ].to(device) | |
| do_sample = True if temperature > 0 else False | |
| print(f"do_sample: {do_sample}") | |
| print(f"temperature: {temperature}") | |
| # https://huggingface.co/blog/how-to-generate | |
| # https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html | |
| model_output = model.generate( | |
| input_ids=input_ids, | |
| attention_mask=attention_masks, | |
| do_sample=do_sample, | |
| max_length=max_length, | |
| top_k=top_k, | |
| num_beams=n_predictions * 2, ## ask for twice since some will be discarted | |
| top_p=0.98, | |
| temperature=temperature, | |
| early_stopping=True, | |
| num_return_sequences=n_predictions * 2, | |
| ) | |
| logging.debug(f"{len(model_output)} predictions for {sentence}") | |
| outputs = [] | |
| for output in model_output: | |
| generated_sent = tokenizer.decode( | |
| output, skip_special_tokens=True, clean_up_tokenization_spaces=True | |
| ) | |
| if ( | |
| self._discard_recommendations(sentence, generated_sent) is False | |
| and generated_sent not in outputs | |
| ): | |
| generated_sent = generated_sent.replace("’", "'") | |
| outputs.append(generated_sent) | |
| else: | |
| logging.debug(f"Discarded: {generated_sent} - source:{sentence}") | |
| discaded = +1 | |
| if len(outputs) == n_predictions: | |
| break | |
| return outputs | |
| def main(): | |
| i = Inference() | |
| sentence = "Aquesta és una associació sense ànim de lucre amb la missió de fomentar la presència i l'ús del català." | |
| model = os.getcwd() | |
| options = i.get_paraphrases(model, sentence, 1.0) | |
| print(f"original: {sentence}") | |
| for option in options: | |
| print(f" {option}") | |
| if __name__ == "__main__": | |
| main() | |