Skip to content

Commit 8e2ac63

Browse files
committed
we should only have the country name for the dialect
1 parent 0367f3a commit 8e2ac63

File tree

556 files changed

+771
-771
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

556 files changed

+771
-771
lines changed

README.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ The first online catalogue for Arabic NLP datasets. This catalogue contains more
2020
- `License` license of the dataset
2121
- `Year` year of publishing the dataset/paper
2222
- `Language` ar or multilingual
23-
- `Dialect` region ar-LEV: (Arabic(Levant)), country ar-EGY: (Arabic (Egypt)) or type ar-MSA: (Arabic (Modern Standard Arabic))
23+
- `Dialect` region Levant, country Egypt or type Modern Standard Arabic
2424
- `Domain` social media, news articles, reviews, commentary, books, transcribed audio or other
2525
- `Form` text, audio or sign language
2626
- `Collection style` crawling, crawling and annotation (translation), crawling and annotation (other), machine translation, human translation, human curation or other
@@ -72,7 +72,7 @@ which gives the following output
7272
'Cost': '',
7373
'Derived From': '',
7474
'Description': 'the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria.',
75-
'Dialect': 'ar-LEV: (Arabic(Levant))',
75+
'Dialect': 'Levant',
7676
'Domain': 'social media',
7777
'Ethical Risks': 'Medium',
7878
'Form': 'text',
@@ -85,19 +85,19 @@ which gives the following output
8585
'Paper Title': 'Shami: A Corpus of Levantine Arabic Dialects',
8686
'Provider': 'Multiple institutions ',
8787
'Script': 'Arab',
88-
'Subsets': [{'Dialect': 'ar-JO: (Arabic (Jordan))',
88+
'Subsets': [{'Dialect': 'Jordan',
8989
'Name': 'Jordanian',
9090
'Unit': 'sentences',
9191
'Volume': '32,078'},
9292
{'Dialect': 'Palestine',
9393
'Name': 'Palestanian',
9494
'Unit': 'sentences',
9595
'Volume': '21,264'},
96-
{'Dialect': 'ar-SY: (Arabic (Syria))',
96+
{'Dialect': 'Syria',
9797
'Name': 'Syrian',
9898
'Unit': 'sentences',
9999
'Volume': '48,159'},
100-
{'Dialect': 'ar-LB: (Arabic (Lebanon))',
100+
{'Dialect': 'Lebanon',
101101
'Name': 'Lebanese',
102102
'Unit': 'sentences',
103103
'Volume': '16,304'}],

datasets/1993-2007_united_nations_parallel_text.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2013,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "other",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/1997_hub5_arabic_evaluation.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2002,
88
"Language": "ar",
9-
"Dialect": "ar-EG: (Arabic (Egypt))",
9+
"Dialect": "Egypt",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/1997_hub5_arabic_transcripts.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2002,
88
"Language": "ar",
9-
"Dialect": "ar-EG: (Arabic (Egypt))",
9+
"Dialect": "Egypt",
1010
"Domain": "transcribed audio",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/2003_nist_language_recognition_evaluation.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2006,
88
"Language": "multilingual",
9-
"Dialect": "ar-EG: (Arabic (Egypt))",
9+
"Dialect": "Egypt",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2005_nist_speaker_recognition_evaluation_test_data.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2005_nist_speaker_recognition_evaluation_training_data.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2006_conll_shared_task_-_arabic_&_czech.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2006,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "news articles",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/2006_nist_speaker_recognition_evaluation_test_set_part_1.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2006_nist_speaker_recognition_evaluation_test_set_part_2.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2012,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2006_nist_speaker_recognition_evaluation_training_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2006_nist_spoken_term_detection_development_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2006_nist_spoken_term_detection_evaluation_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2007_conll_shared_task_-_arabic_&_english.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2018,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "news articles",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/2007_nist_language_recognition_evaluation_supplemental_training_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2009,
88
"Language": "multilingual",
9-
"Dialect": "ar-EG: (Arabic (Egypt))",
9+
"Dialect": "Egypt",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2007_nist_language_recognition_evaluation_test_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2009,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/2008_2010_nist_metrics_for_machine_translation_(metricsmatr)_gale_evaluation_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "web pages",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/2008_nist_metrics_for_machine_translation_(metricsmatr08)_development_data.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2009,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "news articles",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/2018_nist_speaker_recognition_evaluation_test_set.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2020,
88
"Language": "multilingual",
9-
"Dialect": "ar-TN: (Arabic (Tunisia))",
9+
"Dialect": "Tunisia",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/a-speechdb.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "Non Commercial Use - ELRA END USER",
77
"Year": 2011,
88
"Language": "ar",
9-
"Dialect": "ar-EG: (Arabic (Egypt))",
9+
"Dialect": "Egypt",
1010
"Domain": "transcribed audio",
1111
"Form": "spoken",
1212
"Collection Style": "other",

datasets/a7'ta.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "unknown",
77
"Year": 2019,
88
"Language": "ar",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "books",
1111
"Form": "text",
1212
"Collection Style": "crawling",

datasets/a_corpus_of_arabic_literature_(19-20th_centuries)_for_stylometric_tests.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "CC BY 4.0",
77
"Year": 2021,
88
"Language": "ar",
9-
"Dialect": "ar-CLS: (Arabic (Classic))",
9+
"Dialect": "Classical Arabic",
1010
"Domain": "books",
1111
"Form": "text",
1212
"Collection Style": "crawling",

datasets/ace_2004_multilingual_training_corpus.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2004,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "other",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/ace_2005_multilingual_training_corpus.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2006,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "other",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/ace_2007_multilingual_training_corpus.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2014,
88
"Language": "multilingual",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "news articles",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/acqad.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "unknown",
77
"Year": 2022,
88
"Language": "ar",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "wikipedia",
1111
"Form": "text",
1212
"Collection Style": "crawling",

datasets/adcc.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"License": "unknown",
77
"Year": 2017,
88
"Language": "ar",
9-
"Dialect": "ar-MSA: (Arabic (Modern Standard Arabic))",
9+
"Dialect": "Modern Standard Arabic",
1010
"Domain": "other",
1111
"Form": "text",
1212
"Collection Style": "other",

datasets/adi-17.json

+17-17
Original file line numberDiff line numberDiff line change
@@ -3,103 +3,103 @@
33
"Subsets": [
44
{
55
"Name": "Algeria",
6-
"Dialect": "ar-DZ: (Arabic (Algeria))",
6+
"Dialect": "Algeria",
77
"Volume": "115.7",
88
"Unit": "hours"
99
},
1010
{
1111
"Name": "Egypt",
12-
"Dialect": "ar-EG: (Arabic (Egypt))",
12+
"Dialect": "Egypt",
1313
"Volume": "451.1",
1414
"Unit": "hours"
1515
},
1616
{
1717
"Name": "Iraq",
18-
"Dialect": "ar-IQ: (Arabic (Iraq))",
18+
"Dialect": "Iraq",
1919
"Volume": "815.8",
2020
"Unit": "hours"
2121
},
2222
{
2323
"Name": "Jordan",
24-
"Dialect": "ar-JO: (Arabic (Jordan))",
24+
"Dialect": "Jordan",
2525
"Volume": "25.9",
2626
"Unit": "hours"
2727
},
2828
{
2929
"Name": "Saudi Arabia",
30-
"Dialect": "ar-SA: (Arabic (Saudi Arabia))",
30+
"Dialect": "Saudi Arabia",
3131
"Volume": "186.1",
3232
"Unit": "hours"
3333
},
3434
{
3535
"Name": "Kuwait",
36-
"Dialect": "ar-KW: (Arabic (Kuwait))",
36+
"Dialect": "Kuwait",
3737
"Volume": "108.2",
3838
"Unit": "hours"
3939
},
4040
{
4141
"Name": "Lebanon",
42-
"Dialect": "ar-LB: (Arabic (Lebanon))",
42+
"Dialect": "Lebanon",
4343
"Volume": "116.8",
4444
"Unit": "hours"
4545
},
4646
{
4747
"Name": "Libya",
48-
"Dialect": "ar-LY: (Arabic (Libya))",
48+
"Dialect": "Libya",
4949
"Volume": "127.4",
5050
"Unit": "hours"
5151
},
5252
{
5353
"Name": "Mauritania",
54-
"Dialect": "ar-MR: (Arabic (Mauritania))",
54+
"Dialect": "Mauritania",
5555
"Volume": "456.4",
5656
"Unit": "hours"
5757
},
5858
{
5959
"Name": "Morocco",
60-
"Dialect": "ar-MA: (Arabic (Morocco))",
60+
"Dialect": "Morocco",
6161
"Volume": "57.8",
6262
"Unit": "hours"
6363
},
6464
{
6565
"Name": "Oman",
66-
"Dialect": "ar-OM: (Arabic (Oman))",
66+
"Dialect": "Oman",
6767
"Volume": "58.5",
6868
"Unit": "hours"
6969
},
7070
{
7171
"Name": "Palestine",
72-
"Dialect": "ar-PS: (Arabic (Palestine))",
72+
"Dialect": "Palestine",
7373
"Volume": "121.4",
7474
"Unit": "hours"
7575
},
7676
{
7777
"Name": "Qatar",
78-
"Dialect": "ar-QA: (Arabic (Qatar))",
78+
"Dialect": "Qatar",
7979
"Volume": "62.3",
8080
"Unit": "hours"
8181
},
8282
{
8383
"Name": "Sudan",
84-
"Dialect": "ar-SD: (Arabic (Sudan))",
84+
"Dialect": "Sudan",
8585
"Volume": "47.7",
8686
"Unit": "hours"
8787
},
8888
{
8989
"Name": "Syria",
90-
"Dialect": "ar-SY: (Arabic (Syria))",
90+
"Dialect": "Syria",
9191
"Volume": "119.5",
9292
"Unit": "hours"
9393
},
9494
{
9595
"Name": "UAE",
96-
"Dialect": "ar-AE: (Arabic (United Arab Emirates))",
96+
"Dialect": "United Arab Emirates",
9797
"Volume": "108.4",
9898
"Unit": "hours"
9999
},
100100
{
101101
"Name": "Yemen",
102-
"Dialect": "ar-YE: (Arabic (Yemen))",
102+
"Dialect": "Yemen",
103103
"Volume": "53.4",
104104
"Unit": "hours"
105105
}

0 commit comments

Comments
 (0)