| | """ |
| | OPUS (http://opus.nlpl.eu/) is a great collection of different parallel datasets for more than 400 languages. |
| | On the website, you can download parallel datasets for many languages in different formats. I found that |
| | the format "Bottom-left triangle: download plain text files (MOSES/GIZA++)" requires minimal |
| | overhead for post-processing to get it into a suitable format for this library. |
| | |
| | You can use the OPUS dataset to create multilingual sentence embeddings. This script contains code to download |
| | OPUS datasets for the desired languages and to create training files in the right format. |
| | |
| | 1) First, you need to install OpusTools (https://github.com/Helsinki-NLP/OpusTools/tree/master/opustools_pkg): |
| | pip install opustools |
| | |
| | 2) Once you have OpusTools installed, you can download data in the right format via: |
| | mkdir parallel-sentences |
| | opus_read -d [CORPUS] -s [SRC_LANG] -t [TRG_LANG] --write parallel-sentences/[FILENAME].tsv.gz -wm moses -dl opus -p raw |
| | |
| | For example: |
| | mkdir parallel-sentences |
| | opus_read -d JW300 -s en -t de --write parallel-sentences/JW300-en-de.tsv.gz -wm moses -dl opus -p raw |
| | |
| | This downloads the JW300 Corpus (http://opus.nlpl.eu/JW300.php) for English (en) and German (de) and writes the output to |
| | parallel-sentences/JW300-en-de.tsv.gz |
| | |
| | |
| | #################### |
| | |
| | This python code automates the download and creation of the parallel sentences files. |
| | |
| | |
| | """ |
| | from opustools import OpusRead |
| | import os |
| |
|
| |
|
# Corpora to fetch and the language pairs to build: one output file is created
# for every (corpus, source language, target language) combination.
corpora = ['JW300']
source_languages = ['en']
target_languages = ['de', 'es', 'it', 'fr', 'ar', 'tr']

# output_folder receives the finished .tsv.gz files; opus_download_folder is
# handed to OpusRead as its download_dir.
output_folder = 'parallel-sentences'
opus_download_folder = './opus'

os.makedirs(output_folder, exist_ok=True)

for corpus in corpora:
    for src_lang in source_languages:
        for trg_lang in target_languages:
            output_filename = os.path.join(output_folder, "{}-{}-{}.tsv.gz".format(corpus, src_lang, trg_lang))

            # Files that already exist are treated as finished and are not rebuilt.
            if os.path.exists(output_filename):
                continue

            print("Create:", output_filename)
            try:
                read = OpusRead(directory=corpus, source=src_lang, target=trg_lang, write=[output_filename], download_dir=opus_download_folder, preprocess='raw', write_mode='moses', suppress_prompts=True)
                read.printPairs()
            except Exception as exc:
                # Best effort: report the failure and carry on with the next pair.
                # Catching Exception (not a bare except) lets Ctrl-C and
                # SystemExit still stop the script.
                print("An error occured during the creation of", output_filename)
                print("Reason:", repr(exc))

                # A half-written file would pass the existence check above on the
                # next run and never be retried, so discard it.
                if os.path.exists(output_filename):
                    try:
                        os.remove(output_filename)
                    except OSError as cleanup_error:
                        print("Could not remove incomplete file", output_filename, "-", repr(cleanup_error))