Skip to content

Commit c861093

Browse files
committed
remove nan values
1 parent dd11107 commit c861093

File tree

709 files changed

+4857
-4857
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

709 files changed

+4857
-4857
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ vendor
77
.pytest_cache
88
.hypothesis
99
Gemfile.lock
10+
save_jsons.py

assets/js/search.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@ function accessBadge(text) {
2222
function setAttributes(attribute, element) {
2323

2424
if (attribute == "Cost") {
25-
if (element[attribute] != "nan") return element[attribute]
25+
if (element[attribute] != "") return element[attribute]
2626
else return "0$"
2727
} else if (attribute == "Access") {
2828
return accessBadge(element[attribute])
2929
}
30-
else if (element[attribute] != "nan") {
30+
else if (element[attribute] != "") {
3131
return element[attribute]
3232
}
3333

data.json

-1
This file was deleted.

datasets/101_billion_arabic_words_dataset.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222
"Tokenized": "No",
2323
"Host": "HuggingFace",
2424
"Access": "Free",
25-
"Cost": "nan",
25+
"Cost": "",
2626
"Test Split": "No",
2727
"Tasks": "text generation, language modeling",
2828
"Venue Title": "arXiv",
29-
"Citations": "nan",
29+
"Citations": "",
3030
"Venue Type": "preprint",
31-
"Venue Name": "nan",
31+
"Venue Name": "",
3232
"Authors": "Manel Aloui, Hasna Chouikhi, Ghaith Chaabane, Haithem Kchaou, and Chehir Dhaouadi",
3333
"Affiliations": "Clusterlab",
3434
"Abstract": "In recent years, Large Language Models (LLMs) have revolutionized the field of natural language processing, showcasing an impressive rise predominantly in English-centric domains. These advancements have set a global benchmark, inspiring significant efforts toward developing Arabic LLMs capable of understanding and generating the Arabic language with remarkable accuracy. Despite these advancements, a critical challenge persists: the potential bias in Arabic LLMs, primarily attributed to their reliance on datasets comprising English data that has been translated into Arabic. This reliance not only compromises the authenticity of the generated content but also reflects a broader issue\u2014the scarcity of original quality Arabic linguistic data. This study aims to address the data scarcity in the Arab world and to encourage the development of Arabic Language Models that are true to both the linguistic and nuances of the region. We undertook a large-scale data mining project, extracting a substantial volume of text from the Common Crawl WET files, specifically targeting Arabic content. The extracted data underwent a rigorous cleaning and deduplication process, using innovative techniques to ensure the integrity and uniqueness of the dataset. The result is the 101 Billion Arabic Words Dataset, the largest Arabic dataset available to date, which can significantly contribute to the development of authentic Arabic LLMs. This study not only highlights the potential for creating linguistically and culturally accurate Arabic LLMs but also sets a precedent for future research in enhancing the authenticity of Arabic language models.",
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "1993-2007 United Nations Parallel Text",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2013T06",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2013,
@@ -15,22 +15,22 @@
1515
"Unit": "documents",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "175.00 $",
2626
"Test Split": "No",
2727
"Tasks": "machine translation",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
+12-12
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "1997 HUB5 Arabic Evaluation",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2002S22",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2002,
@@ -15,22 +15,22 @@
1515
"Unit": "documents",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
21-
"Script": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
21+
"Script": "",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "1,500.00 $",
2626
"Test Split": "No",
2727
"Tasks": "speech recognition",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
+12-12
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "1997 HUB5 Arabic Transcripts",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2002T39",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2002,
@@ -15,22 +15,22 @@
1515
"Unit": "documents",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
21-
"Script": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
21+
"Script": "",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "500.00 $",
2626
"Test Split": "No",
2727
"Tasks": "speech recognition",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "2003 NIST Language Recognition Evaluation",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2006S31",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2006,
@@ -15,22 +15,22 @@
1515
"Unit": "hours",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "500.00 $",
2626
"Test Split": "No",
2727
"Tasks": "language identification",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "2003 NIST Rich Transcription Evaluation Data",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2007S10",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2007,
@@ -15,22 +15,22 @@
1515
"Unit": "hours",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "2,000.00 $",
2626
"Test Split": "No",
2727
"Tasks": "speech recognition",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "2005 NIST Speaker Recognition Evaluation Test Data",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2011S04",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
@@ -15,22 +15,22 @@
1515
"Unit": "hours",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "400.00 $",
2626
"Test Split": "No",
2727
"Tasks": "speaker identification",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "2005 NIST Speaker Recognition Evaluation Training Data",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2011S01",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2011,
@@ -15,22 +15,22 @@
1515
"Unit": "hours",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
18-
"Derived From": "nan",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
18+
"Derived From": "",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "With-Fee",
2525
"Cost": "350.00 $",
2626
"Test Split": "No",
2727
"Tasks": "speaker identification",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"Name": "2006 CoNLL Shared Task - Arabic & Czech",
33
"Subsets": [],
4-
"HF Link": "nan",
4+
"HF Link": "",
55
"Link": "https://catalog.ldc.upenn.edu/LDC2015T12",
66
"License": "LDC User Agreement for Non-Members",
77
"Year": 2006,
@@ -11,26 +11,26 @@
1111
"Form": "text",
1212
"Collection Style": "other",
1313
"Description": "2006 CoNLL Shared Task - Arabic & Czech consists of Arabic and Czech dependency treebanks used as part of the CoNLL 2006 shared task on multi-lingual dependency parsing.",
14-
"Volume": "nan",
14+
"Volume": "",
1515
"Unit": "tokens",
1616
"Ethical Risks": "Low",
1717
"Provider": "LDC",
1818
"Derived From": "PADT",
19-
"Paper Title": "nan",
20-
"Paper Link": "nan",
19+
"Paper Title": "",
20+
"Paper Link": "",
2121
"Script": "Arab-Latn",
2222
"Tokenized": "No",
2323
"Host": "LDC",
2424
"Access": "Upon-Request",
25-
"Cost": "nan",
25+
"Cost": "",
2626
"Test Split": "No",
2727
"Tasks": "syntactic parsing",
28-
"Venue Title": "nan",
29-
"Citations": "nan",
30-
"Venue Type": "nan",
31-
"Venue Name": "nan",
32-
"Authors": "nan",
33-
"Affiliations": "nan",
34-
"Abstract": "nan",
28+
"Venue Title": "",
29+
"Citations": "",
30+
"Venue Type": "",
31+
"Venue Name": "",
32+
"Authors": "",
33+
"Affiliations": "",
34+
"Abstract": "",
3535
"Added By": "Zaid Alyafeai"
3636
}

0 commit comments

Comments
 (0)