Changes
On September 9, 2025 at 5:36:05 AM UTC,

-
No fields were updated. See the metadata diff for more details.
f | 1 | { | f | 1 | { |
2 | "Observaciones": { | 2 | "Observaciones": { | ||
3 | "en": "Recommended citation : Styll, Patrick; Campillos-Llanos, | 3 | "en": "Recommended citation : Styll, Patrick; Campillos-Llanos, | ||
4 | Leonardo; 2025; Medical Artificial Intelligence text Detection in | 4 | Leonardo; 2025; Medical Artificial Intelligence text Detection in | ||
5 | Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; | 5 | Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; | ||
6 | https://doi.org/10.20350/digitalCSIC/17276", | 6 | https://doi.org/10.20350/digitalCSIC/17276", | ||
7 | "es": "Cita recomendada: Styll, Patrick; Campillos-Llanos, | 7 | "es": "Cita recomendada: Styll, Patrick; Campillos-Llanos, | ||
8 | Leonardo; 2025; Medical Artificial Intelligence text Detection in | 8 | Leonardo; 2025; Medical Artificial Intelligence text Detection in | ||
9 | Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; | 9 | Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; | ||
10 | https://doi.org/10.20350/digitalCSIC/17276" | 10 | https://doi.org/10.20350/digitalCSIC/17276" | ||
11 | }, | 11 | }, | ||
12 | "author": null, | 12 | "author": null, | ||
13 | "author_email": null, | 13 | "author_email": null, | ||
14 | "autor": { | 14 | "autor": { | ||
15 | "en": [ | 15 | "en": [ | ||
16 | "Patrick Styll", | 16 | "Patrick Styll", | ||
17 | "Leonardo Campillos-Llanos" | 17 | "Leonardo Campillos-Llanos" | ||
18 | ], | 18 | ], | ||
19 | "es": [ | 19 | "es": [ | ||
20 | "Patrick Styll", | 20 | "Patrick Styll", | ||
21 | "Leonardo Campillos-Llanos" | 21 | "Leonardo Campillos-Llanos" | ||
22 | ] | 22 | ] | ||
23 | }, | 23 | }, | ||
24 | "conforms_to": [], | 24 | "conforms_to": [], | ||
25 | "coverage_new": { | 25 | "coverage_new": { | ||
26 | "1": { | 26 | "1": { | ||
27 | "from": "2025-03-15T00:00:00", | 27 | "from": "2025-03-15T00:00:00", | ||
28 | "to": "2025-03-15T00:00:00" | 28 | "to": "2025-03-15T00:00:00" | ||
29 | } | 29 | } | ||
30 | }, | 30 | }, | ||
31 | "creator_user_id": "196556b3-e0c4-4c51-a9e6-f51cc752bc37", | 31 | "creator_user_id": "196556b3-e0c4-4c51-a9e6-f51cc752bc37", | ||
32 | "description": { | 32 | "description": { | ||
33 | "en": "This dataset was created by gathering human-authored | 33 | "en": "This dataset was created by gathering human-authored | ||
34 | corpora from several public health sites and generating additional | 34 | corpora from several public health sites and generating additional | ||
35 | data via three different LLMs: GPT-4o, Mistral-7B and Llama3-1. We | 35 | data via three different LLMs: GPT-4o, Mistral-7B and Llama3-1. We | ||
36 | included texts in English, Spanish, German and French from the | 36 | included texts in English, Spanish, German and French from the | ||
37 | biomedical domain. The current version gathers 50% AI-generated and | 37 | biomedical domain. The current version gathers 50% AI-generated and | ||
38 | 50% human-written texts. The following are the data we used:\r\n\r\n- | 38 | 50% human-written texts. The following are the data we used:\r\n\r\n- | ||
39 | Cochrane Library: This is a database of meta-analyses and systematic | 39 | Cochrane Library: This is a database of meta-analyses and systematic | ||
40 | reviews of updated results of clinical studies. We used abstracts of | 40 | reviews of updated results of clinical studies. We used abstracts of | ||
41 | systematic reviews in all four languages.\r\n\r\n- European Clinical | 41 | systematic reviews in all four languages.\r\n\r\n- European Clinical | ||
42 | Trials (EUCT): This agency supervises and evaluates | 42 | Trials (EUCT): This agency supervises and evaluates | ||
43 | pharmaceutical products of the European Union (EU). We downloaded | 43 | pharmaceutical products of the European Union (EU). We downloaded | ||
44 | parallel data from public assessment reports (EPARs) from 12 new | 44 | parallel data from public assessment reports (EPARs) from 12 new | ||
45 | medicinal products, and data from clinical trial protocols and | 45 | medicinal products, and data from clinical trial protocols and | ||
46 | eligibility criteria. We ensured the data were published only from | 46 | eligibility criteria. We ensured the data were published only from | ||
47 | January 2025 to date. The goal was gathering data that might not have | 47 | January 2025 to date. The goal was gathering data that might not have | ||
48 | been used to train the LLMs in our experiments.\r\n\r\n- European | 48 | been used to train the LLMs in our experiments.\r\n\r\n- European | ||
49 | Medicines Agency (EMA): This agency supervises and evaluates | 49 | Medicines Agency (EMA): This agency supervises and evaluates | ||
50 | pharmaceutical products of the European Union (EU). We downloaded | 50 | pharmaceutical products of the European Union (EU). We downloaded | ||
51 | parallel data from public assessment reports (EPARs) from 12 new | 51 | parallel data from public assessment reports (EPARs) from 12 new | ||
52 | medicinal products, and data from clinical trial protocols and | 52 | medicinal products, and data from clinical trial protocols and | ||
53 | eligibility criteria. We ensured the data were published only from | 53 | eligibility criteria. We ensured the data were published only from | ||
54 | January 2025 to date. The goal was gathering data that might not have | 54 | January 2025 to date. The goal was gathering data that might not have | ||
55 | been used to train the LLMs in our experiments.\r\n\r\n- European Food | 55 | been used to train the LLMs in our experiments.\r\n\r\n- European Food | ||
56 | Safety Authority (EFSA): This website provides a comprehensive range | 56 | Safety Authority (EFSA): This website provides a comprehensive range | ||
57 | of data about food consumption and chemical/biological monitoring | 57 | of data about food consumption and chemical/biological monitoring | ||
58 | data. We chose only the topics we deem necessary for our goals, | 58 | data. We chose only the topics we deem necessary for our goals, | ||
59 | therefore including a total of 51 topics. Processing: we manually | 59 | therefore including a total of 51 topics. Processing: we manually | ||
60 | split articles with a word count above 1350 and manually ensured | 60 | split articles with a word count above 1350 and manually ensured | ||
61 | their correctness and alignment in all languages.\r\n\r\n- European | 61 | their correctness and alignment in all languages.\r\n\r\n- European | ||
62 | Vaccination Information Portal (EVIP): it provides up-to-date | 62 | Vaccination Information Portal (EVIP): it provides up-to-date | ||
63 | information on vaccines and vaccination. The factsheets are available | 63 | information on vaccines and vaccination. The factsheets are available | ||
64 | in all languages, and consist of 20 texts each.\r\n\r\n- Immunize: | 64 | in all languages, and consist of 20 texts each.\r\n\r\n- Immunize: | ||
65 | Immunize.org (formerly known as the Immunization Action Coalition) is | 65 | Immunize.org (formerly known as the Immunization Action Coalition) is | ||
66 | a U.S.-based organization dedicated to providing comprehensive | 66 | a U.S.-based organization dedicated to providing comprehensive | ||
67 | immunization resources for healthcare professionals and the public. | 67 | immunization resources for healthcare professionals and the public. | ||
68 | Vaccine Information Sheets (VISs) have been translated into several | 68 | Vaccine Information Sheets (VISs) have been translated into several | ||
69 | languages, but not all of them contain all VISs. They are given as | 69 | languages, but not all of them contain all VISs. They are given as | ||
70 | PDFs, with 25 in Spanish, French and English, but only 21 in German. | 70 | PDFs, with 25 in Spanish, French and English, but only 21 in German. | ||
71 | Only PDFs overlapping in all languages were used.\r\n\r\n- Migration | 71 | Only PDFs overlapping in all languages were used.\r\n\r\n- Migration | ||
72 | und Gesundheit - German Ministry of Health (BFG): This portal provides | 72 | und Gesundheit - German Ministry of Health (BFG): This portal provides | ||
73 | multilingual health information tailored for migrants and refugees. | 73 | multilingual health information tailored for migrants and refugees. | ||
74 | Gesundheit f\u00fcr alle is a PDF file that provides a guide to the | 74 | Gesundheit f\u00fcr alle is a PDF file that provides a guide to the | ||
75 | German healthcare system, and it is available in Spanish, English and | 75 | German healthcare system, and it is available in Spanish, English and | ||
76 | German. Processing: Two topics, which were shorter than 100 words, | 76 | German. Processing: Two topics, which were shorter than 100 words, | ||
77 | were merged with the next one to ensure that context is | 77 | were merged with the next one to ensure that context is | ||
78 | preserved.\r\n\r\n- Orphadata (INSERM): a comprehensive knowledge base | 78 | preserved.\r\n\r\n- Orphadata (INSERM): a comprehensive knowledge base | ||
79 | about rare diseases and orphan drugs, in re-usable and high-quality | 79 | about rare diseases and orphan drugs, in re-usable and high-quality | ||
80 | formats, released in 12 official EU languages. We gathered | 80 | formats, released in 12 official EU languages. We gathered | ||
81 | definitions, signs and symptoms and phenotypes about 4389 rare | 81 | definitions, signs and symptoms and phenotypes about 4389 rare | ||
82 | diseases in English, German, Spanish and French. Processing: Since | 82 | diseases in English, German, Spanish and French. Processing: Since | ||
83 | each definition is roughly the same size and similar format, we simply | 83 | each definition is roughly the same size and similar format, we simply | ||
84 | group 5 definitions together to make the text per topic | 84 | group 5 definitions together to make the text per topic | ||
85 | longer.\r\n\r\n- PubMed (National Library of Medicine): we downloaded | 85 | longer.\r\n\r\n- PubMed (National Library of Medicine): we downloaded | ||
86 | abstracts available in English, Spanish, French and German.\r\n\r\n- | 86 | abstracts available in English, Spanish, French and German.\r\n\r\n- | ||
87 | Wikipedia: a free, web-based, collaborative multilingual encyclopedia | 87 | Wikipedia: a free, web-based, collaborative multilingual encyclopedia | ||
88 | project; we selected (bio)medical contents available in English, | 88 | project; we selected (bio)medical contents available in English, | ||
89 | German, Spanish and French. To ensure that the texts were not | 89 | German, Spanish and French. To ensure that the texts were not | ||
90 | automatically generated, we only use articles that date back to before | 90 | automatically generated, we only use articles that date back to before | ||
91 | the release of ChatGPT, i.e. before 30th November 2022. Processing: | 91 | the release of ChatGPT, i.e. before 30th November 2022. Processing: | ||
92 | some data cleaning was necessary; we also removed all topics with less | 92 | some data cleaning was necessary; we also removed all topics with less | ||
93 | than 5 words, or split those with more than 9 sentences into equally | 93 | than 5 words, or split those with more than 9 sentences into equally | ||
94 | long parts. From these split up files, we make sure that they contain | 94 | long parts. From these split up files, we make sure that they contain | ||
95 | a minimum of 100 words, and we take only those contents or topics that | 95 | a minimum of 100 words, and we take only those contents or topics that | ||
96 | exist in all three languages.\r\n\r\n[Description of methods used for | 96 | exist in all three languages.\r\n\r\n[Description of methods used for | ||
97 | collection/generation of data] The corpus statistics and methods are | 97 | collection/generation of data] The corpus statistics and methods are | ||
98 | explained in the following article: Patrick Styll, Leonardo | 98 | explained in the following article: Patrick Styll, Leonardo | ||
99 | Campillos-Llanos, Jorge Fern\u00e1ndez-Garc\u00eda, Isabel | 99 | Campillos-Llanos, Jorge Fern\u00e1ndez-Garc\u00eda, Isabel | ||
100 | Segura-Bedmar (2025) \"MedAID-ML: A Multilingual Dataset of Biomedical | 100 | Segura-Bedmar (2025) \"MedAID-ML: A Multilingual Dataset of Biomedical | ||
101 | Texts for Detecting AI-Generated Content\". Under | 101 | Texts for Detecting AI-Generated Content\". Under | ||
102 | review.\r\n\r\n[Methods for processing the data] - Web-scraping of | 102 | review.\r\n\r\n[Methods for processing the data] - Web-scraping of | ||
103 | data from HTML content and PDF files available on the websites of the | 103 | data from HTML content and PDF files available on the websites of the | ||
104 | health contents.\r\n- Postprocessing and cleaning of data (e.g., | 104 | health contents.\r\n- Postprocessing and cleaning of data (e.g., | ||
105 | removal of redundant white spaces or line breaks), and homogenization | 105 | removal of redundant white spaces or line breaks), and homogenization | ||
106 | of text length.\r\n- Generation of corresponding contents by means of | 106 | of text length.\r\n- Generation of corresponding contents by means of | ||
107 | generative AI using three large language models: GPT-4o, Mistral-7B | 107 | generative AI using three large language models: GPT-4o, Mistral-7B | ||
108 | and Llama3-1.\r\n- Formatting of contents into JSON | 108 | and Llama3-1.\r\n- Formatting of contents into JSON | ||
109 | format.\r\n\r\n[Files] 1) JSON files: These are separated in TRAIN and | 109 | format.\r\n\r\n[Files] 1) JSON files: These are separated in TRAIN and | ||
110 | TEST. Each file has a list of hashes for each text, and each hash | 110 | TEST. Each file has a list of hashes for each text, and each hash | ||
111 | contains the following fields:\r\n \u2022 text: the textual | 111 | contains the following fields:\r\n \u2022 text: the textual | ||
112 | content.\r\n \u2022 data_source: the source repository of the | 112 | content.\r\n \u2022 data_source: the source repository of the | ||
113 | text.\r\n \u2022 filename: the name of the original file from which | 113 | text.\r\n \u2022 filename: the name of the original file from which | ||
114 | the data were sourced.\r\n \u2022 source: label indicating if it is a | 114 | the data were sourced.\r\n \u2022 source: label indicating if it is a | ||
115 | human-written text (HUMAN) or the LLM used to generate the text | 115 | human-written text (HUMAN) or the LLM used to generate the text | ||
116 | (\"gpt4o\", \"mistral\" or \"llama\").\r\n \u2022 \"language\": The | 116 | (\"gpt4o\", \"mistral\" or \"llama\").\r\n \u2022 \"language\": The | ||
117 | language code of the text: German (\"de\"), English (\"en\"), Spanish | 117 | language code of the text: German (\"de\"), English (\"en\"), Spanish | ||
118 | (\"es\") or French (\"fr\").\r\n \u2022 \"target\": a binary label to | 118 | (\"es\") or French (\"fr\").\r\n \u2022 \"target\": a binary label to | ||
119 | code if the text is written by humans (\"0\") or AI (\"1\").\r\n | 119 | code if the text is written by humans (\"0\") or AI (\"1\").\r\n | ||
120 | \u2022 \"ratio\": The proportion of the text that was created with AI: | 120 | \u2022 \"ratio\": The proportion of the text that was created with AI: | ||
121 | \"0.5\" for AI-generated texts, and \"null\" for human | 121 | \"0.5\" for AI-generated texts, and \"null\" for human | ||
122 | texts.\r\n\r\nThe corpus is made up of 13292 comparable and parallel | 122 | texts.\r\n\r\nThe corpus is made up of 13292 comparable and parallel | ||
123 | texts in four languages: German, English, Spanish and French. The | 123 | texts in four languages: German, English, Spanish and French. The | ||
124 | total token count is 3795449 tokens.\r\nThis resource is aimed at | 124 | total token count is 3795449 tokens.\r\nThis resource is aimed at | ||
125 | training and evaluating models to detect medical texts created by | 125 | training and evaluating models to detect medical texts created by | ||
126 | means of generative artificial intelligence.", | 126 | means of generative artificial intelligence.", | ||
127 | "es": "Este conjunto de datos se cre\u00f3 recopilando corpus de | 127 | "es": "Este conjunto de datos se cre\u00f3 recopilando corpus de | ||
128 | autor\u00eda humana de varios centros de salud p\u00fablica y | 128 | autor\u00eda humana de varios centros de salud p\u00fablica y | ||
129 | generando datos adicionales mediante tres LLM diferentes: GPT-4o, | 129 | generando datos adicionales mediante tres LLM diferentes: GPT-4o, | ||
130 | Mistral-7B y Llama3-1. Incluimos textos en ingl\u00e9s, espa\u00f1ol, | 130 | Mistral-7B y Llama3-1. Incluimos textos en ingl\u00e9s, espa\u00f1ol, | ||
131 | alem\u00e1n y franc\u00e9s del \u00e1mbito biom\u00e9dico. La | 131 | alem\u00e1n y franc\u00e9s del \u00e1mbito biom\u00e9dico. La | ||
132 | versi\u00f3n actual recopila un 50 % de textos generados por IA y un | 132 | versi\u00f3n actual recopila un 50 % de textos generados por IA y un | ||
133 | 50 % de textos escritos por humanos. A continuaci\u00f3n, se detallan | 133 | 50 % de textos escritos por humanos. A continuaci\u00f3n, se detallan | ||
134 | los datos utilizados:\r\n\r\nBiblioteca Cochrane: Base de datos de | 134 | los datos utilizados:\r\n\r\nBiblioteca Cochrane: Base de datos de | ||
135 | metaan\u00e1lisis y revisiones sistem\u00e1ticas con resultados | 135 | metaan\u00e1lisis y revisiones sistem\u00e1ticas con resultados | ||
136 | actualizados de estudios cl\u00ednicos. Se utilizaron res\u00famenes | 136 | actualizados de estudios cl\u00ednicos. Se utilizaron res\u00famenes | ||
137 | de revisiones sistem\u00e1ticas en los cuatro idiomas.\r\n\r\nEnsayos | 137 | de revisiones sistem\u00e1ticas en los cuatro idiomas.\r\n\r\nEnsayos | ||
138 | Cl\u00ednicos Europeos (EUCT): Esta agencia supervisa y eval\u00faa | 138 | Cl\u00ednicos Europeos (EUCT): Esta agencia supervisa y eval\u00faa | ||
139 | los productos farmac\u00e9uticos de la Uni\u00f3n Europea (UE). | 139 | los productos farmac\u00e9uticos de la Uni\u00f3n Europea (UE). | ||
140 | Descargamos datos paralelos de los informes p\u00fablicos de | 140 | Descargamos datos paralelos de los informes p\u00fablicos de | ||
141 | evaluaci\u00f3n (EPAR) de 12 nuevos medicamentos, as\u00ed como datos | 141 | evaluaci\u00f3n (EPAR) de 12 nuevos medicamentos, as\u00ed como datos | ||
142 | de los protocolos de ensayos cl\u00ednicos y los criterios de | 142 | de los protocolos de ensayos cl\u00ednicos y los criterios de | ||
143 | elegibilidad. Nos aseguramos de que los datos se publicaran | 143 | elegibilidad. Nos aseguramos de que los datos se publicaran | ||
144 | \u00fanicamente desde enero de 2025 hasta la fecha. El objetivo era | 144 | \u00fanicamente desde enero de 2025 hasta la fecha. El objetivo era | ||
145 | recopilar datos que podr\u00edan no haberse utilizado para entrenar a | 145 | recopilar datos que podr\u00edan no haberse utilizado para entrenar a | ||
146 | los LLM en nuestros experimentos.\r\n\r\nAgencia Europea de | 146 | los LLM en nuestros experimentos.\r\n\r\nAgencia Europea de | ||
147 | Medicamentos (EMA): Esta agencia supervisa y eval\u00faa los productos | 147 | Medicamentos (EMA): Esta agencia supervisa y eval\u00faa los productos | ||
148 | farmac\u00e9uticos de la Uni\u00f3n Europea (UE). Descargamos datos | 148 | farmac\u00e9uticos de la Uni\u00f3n Europea (UE). Descargamos datos | ||
149 | paralelos de los informes p\u00fablicos de evaluaci\u00f3n (EPAR) de | 149 | paralelos de los informes p\u00fablicos de evaluaci\u00f3n (EPAR) de | ||
150 | 12 nuevos medicamentos, as\u00ed como datos de los protocolos de | 150 | 12 nuevos medicamentos, as\u00ed como datos de los protocolos de | ||
151 | ensayos cl\u00ednicos y los criterios de elegibilidad. Nos aseguramos | 151 | ensayos cl\u00ednicos y los criterios de elegibilidad. Nos aseguramos | ||
152 | de que los datos se publicaran \u00fanicamente desde enero de 2025 | 152 | de que los datos se publicaran \u00fanicamente desde enero de 2025 | ||
153 | hasta la fecha. El objetivo era recopilar datos que podr\u00edan no | 153 | hasta la fecha. El objetivo era recopilar datos que podr\u00edan no | ||
154 | haberse utilizado para entrenar a los LLM en nuestros | 154 | haberse utilizado para entrenar a los LLM en nuestros | ||
155 | experimentos.\r\n\r\nAutoridad Europea de Seguridad Alimentaria | 155 | experimentos.\r\n\r\nAutoridad Europea de Seguridad Alimentaria | ||
156 | (AESA): Este sitio web ofrece una amplia gama de datos sobre consumo | 156 | (AESA): Este sitio web ofrece una amplia gama de datos sobre consumo | ||
157 | de alimentos y datos de control qu\u00edmico/biol\u00f3gico. | 157 | de alimentos y datos de control qu\u00edmico/biol\u00f3gico. | ||
158 | Seleccionamos \u00fanicamente los temas que consideramos necesarios | 158 | Seleccionamos \u00fanicamente los temas que consideramos necesarios | ||
159 | para nuestros objetivos, por lo que incluimos un total de 51. | 159 | para nuestros objetivos, por lo que incluimos un total de 51. | ||
160 | Procesamiento: dividimos manualmente los art\u00edculos con m\u00e1s | 160 | Procesamiento: dividimos manualmente los art\u00edculos con m\u00e1s | ||
161 | de 1350 palabras y verificamos manualmente su correcci\u00f3n y | 161 | de 1350 palabras y verificamos manualmente su correcci\u00f3n y | ||
162 | alineaci\u00f3n en todos los idiomas.\r\n\r\nPortal Europeo de | 162 | alineaci\u00f3n en todos los idiomas.\r\n\r\nPortal Europeo de | ||
163 | Informaci\u00f3n sobre Vacunaci\u00f3n (EVIP): Ofrece informaci\u00f3n | 163 | Informaci\u00f3n sobre Vacunaci\u00f3n (EVIP): Ofrece informaci\u00f3n | ||
164 | actualizada sobre vacunas y vacunaci\u00f3n. Las fichas informativas | 164 | actualizada sobre vacunas y vacunaci\u00f3n. Las fichas informativas | ||
165 | est\u00e1n disponibles en todos los idiomas y constan de 20 textos | 165 | est\u00e1n disponibles en todos los idiomas y constan de 20 textos | ||
166 | cada una.\r\n\r\nInmunizar: Immunize.org (anteriormente conocida como | 166 | cada una.\r\n\r\nInmunizar: Immunize.org (anteriormente conocida como | ||
167 | la Coalici\u00f3n de Acci\u00f3n para la Inmunizaci\u00f3n) es una | 167 | la Coalici\u00f3n de Acci\u00f3n para la Inmunizaci\u00f3n) es una | ||
168 | organizaci\u00f3n con sede en EE. UU. dedicada a proporcionar recursos | 168 | organizaci\u00f3n con sede en EE. UU. dedicada a proporcionar recursos | ||
169 | integrales de inmunizaci\u00f3n a profesionales de la salud y al | 169 | integrales de inmunizaci\u00f3n a profesionales de la salud y al | ||
170 | p\u00fablico en general. Las Hojas de Informaci\u00f3n sobre Vacunas | 170 | p\u00fablico en general. Las Hojas de Informaci\u00f3n sobre Vacunas | ||
171 | (VIS) se han traducido a varios idiomas, pero no todas contienen todas | 171 | (VIS) se han traducido a varios idiomas, pero no todas contienen todas | ||
172 | las VIS. Se ofrecen en formato PDF: 25 est\u00e1n en espa\u00f1ol, | 172 | las VIS. Se ofrecen en formato PDF: 25 est\u00e1n en espa\u00f1ol, | ||
173 | franc\u00e9s e ingl\u00e9s, pero solo 21 en alem\u00e1n. Solo se | 173 | franc\u00e9s e ingl\u00e9s, pero solo 21 en alem\u00e1n. Solo se | ||
174 | utilizaron PDF que coincid\u00edan en todos los | 174 | utilizaron PDF que coincid\u00edan en todos los | ||
175 | idiomas.\r\n\r\nMigraci\u00f3n y Salud - Ministerio de Salud de | 175 | idiomas.\r\n\r\nMigraci\u00f3n y Salud - Ministerio de Salud de | ||
176 | Alemania (BFG): Este portal ofrece informaci\u00f3n sanitaria | 176 | Alemania (BFG): Este portal ofrece informaci\u00f3n sanitaria | ||
177 | multiling\u00fce adaptada a migrantes y refugiados. \u00abSalud para | 177 | multiling\u00fce adaptada a migrantes y refugiados. \u00abSalud para | ||
178 | todos\u00bb es un archivo PDF que ofrece una gu\u00eda del sistema | 178 | todos\u00bb es un archivo PDF que ofrece una gu\u00eda del sistema | ||
179 | sanitario alem\u00e1n, disponible en espa\u00f1ol, ingl\u00e9s y | 179 | sanitario alem\u00e1n, disponible en espa\u00f1ol, ingl\u00e9s y | ||
180 | alem\u00e1n. Procesamiento: Dos temas, de menos de 100 palabras, se | 180 | alem\u00e1n. Procesamiento: Dos temas, de menos de 100 palabras, se | ||
181 | fusionaron con el siguiente para garantizar la conservaci\u00f3n del | 181 | fusionaron con el siguiente para garantizar la conservaci\u00f3n del | ||
182 | contexto.\r\n\r\nOrphadata (INSERM): una base de conocimiento completa | 182 | contexto.\r\n\r\nOrphadata (INSERM): una base de conocimiento completa | ||
183 | sobre enfermedades raras y medicamentos hu\u00e9rfanos, en formatos | 183 | sobre enfermedades raras y medicamentos hu\u00e9rfanos, en formatos | ||
184 | reutilizables y de alta calidad, disponible en 12 idiomas oficiales de | 184 | reutilizables y de alta calidad, disponible en 12 idiomas oficiales de | ||
185 | la UE. Recopilamos definiciones, signos y s\u00edntomas, y fenotipos | 185 | la UE. Recopilamos definiciones, signos y s\u00edntomas, y fenotipos | ||
186 | de 4389 enfermedades raras en ingl\u00e9s, alem\u00e1n, espa\u00f1ol y | 186 | de 4389 enfermedades raras en ingl\u00e9s, alem\u00e1n, espa\u00f1ol y | ||
187 | franc\u00e9s. Procesamiento: Dado que cada definici\u00f3n tiene | 187 | franc\u00e9s. Procesamiento: Dado que cada definici\u00f3n tiene | ||
188 | aproximadamente el mismo tama\u00f1o y formato, simplemente agrupamos | 188 | aproximadamente el mismo tama\u00f1o y formato, simplemente agrupamos | ||
189 | 5 definiciones para ampliar el texto por tema.\r\n\r\nPubMed | 189 | 5 definiciones para ampliar el texto por tema.\r\n\r\nPubMed | ||
190 | (Biblioteca Nacional de Medicina): descargamos res\u00famenes | 190 | (Biblioteca Nacional de Medicina): descargamos res\u00famenes | ||
191 | disponibles en ingl\u00e9s, espa\u00f1ol, franc\u00e9s y | 191 | disponibles en ingl\u00e9s, espa\u00f1ol, franc\u00e9s y | ||
192 | alem\u00e1n.\r\n\r\nWikipedia: un proyecto de enciclopedia | 192 | alem\u00e1n.\r\n\r\nWikipedia: un proyecto de enciclopedia | ||
193 | multiling\u00fce colaborativo, gratuito y basado en la web. | 193 | multiling\u00fce colaborativo, gratuito y basado en la web. | ||
194 | Seleccionamos contenido (bio)m\u00e9dico disponible en ingl\u00e9s, | 194 | Seleccionamos contenido (bio)m\u00e9dico disponible en ingl\u00e9s, | ||
195 | alem\u00e1n, espa\u00f1ol y franc\u00e9s. Para garantizar que los | 195 | alem\u00e1n, espa\u00f1ol y franc\u00e9s. Para garantizar que los | ||
196 | textos no se generaran autom\u00e1ticamente, solo utilizamos | 196 | textos no se generaran autom\u00e1ticamente, solo utilizamos | ||
197 | art\u00edculos anteriores al lanzamiento de ChatGPT, es decir, | 197 | art\u00edculos anteriores al lanzamiento de ChatGPT, es decir, | ||
198 | anteriores al 30 de noviembre de 2022. Procesamiento: fue necesaria | 198 | anteriores al 30 de noviembre de 2022. Procesamiento: fue necesaria | ||
199 | una limpieza de datos; tambi\u00e9n eliminamos todos los temas con | 199 | una limpieza de datos; tambi\u00e9n eliminamos todos los temas con | ||
200 | menos de 5 palabras o dividimos aquellos con m\u00e1s de 9 oraciones | 200 | menos de 5 palabras o dividimos aquellos con m\u00e1s de 9 oraciones | ||
201 | en partes de igual longitud. De estos archivos divididos, nos | 201 | en partes de igual longitud. De estos archivos divididos, nos | ||
202 | aseguramos de que contengan un m\u00ednimo de 100 palabras y solo | 202 | aseguramos de que contengan un m\u00ednimo de 100 palabras y solo | ||
203 | utilizamos los contenidos o temas disponibles en los tres | 203 | utilizamos los contenidos o temas disponibles en los tres | ||
204 | idiomas.\r\n\r\n[Descripci\u00f3n de los m\u00e9todos utilizados para | 204 | idiomas.\r\n\r\n[Descripci\u00f3n de los m\u00e9todos utilizados para | ||
205 | la recopilaci\u00f3n/generaci\u00f3n de datos] Las estad\u00edsticas y | 205 | la recopilaci\u00f3n/generaci\u00f3n de datos] Las estad\u00edsticas y | ||
206 | los m\u00e9todos del corpus se explican en el siguiente art\u00edculo: | 206 | los m\u00e9todos del corpus se explican en el siguiente art\u00edculo: | ||
207 | Patrick Styll, Leonardo Campillos-Llanos, Jorge | 207 | Patrick Styll, Leonardo Campillos-Llanos, Jorge | ||
208 | Fern\u00e1ndez-Garc\u00eda, Isabel Segura-Bedmar (2025) | 208 | Fern\u00e1ndez-Garc\u00eda, Isabel Segura-Bedmar (2025) | ||
209 | \u00abMedAID-ML: Un conjunto de datos multiling\u00fce de textos | 209 | \u00abMedAID-ML: Un conjunto de datos multiling\u00fce de textos | ||
210 | biom\u00e9dicos para la detecci\u00f3n de contenido generado por | 210 | biom\u00e9dicos para la detecci\u00f3n de contenido generado por | ||
211 | IA\u00bb. En revisi\u00f3n.\r\n\r\n[M\u00e9todos de procesamiento de | 211 | IA\u00bb. En revisi\u00f3n.\r\n\r\n[M\u00e9todos de procesamiento de | ||
212 | los datos] - Web-scraping de datos de contenido HTML y archivos PDF | 212 | los datos] - Web-scraping de datos de contenido HTML y archivos PDF | ||
213 | disponibles en los sitios web de contenidos de salud. - | 213 | disponibles en los sitios web de contenidos de salud. - | ||
214 | Postprocesamiento y limpieza de datos (por ejemplo, eliminaci\u00f3n | 214 | Postprocesamiento y limpieza de datos (por ejemplo, eliminaci\u00f3n | ||
215 | de espacios en blanco redundantes o saltos de l\u00ednea) y | 215 | de espacios en blanco redundantes o saltos de l\u00ednea) y | ||
216 | homogeneizaci\u00f3n de la longitud del texto. - Generaci\u00f3n de | 216 | homogeneizaci\u00f3n de la longitud del texto. - Generaci\u00f3n de | ||
217 | contenidos correspondientes mediante IA generativa utilizando tres | 217 | contenidos correspondientes mediante IA generativa utilizando tres | ||
218 | grandes modelos de lenguaje: GPT-4o, Mistral-7B y Llama3-1. - Formateo | 218 | grandes modelos de lenguaje: GPT-4o, Mistral-7B y Llama3-1. - Formateo | ||
219 | de contenidos en formato JSON.\r\n\r\n[Archivos] 1) Archivos JSON: Se | 219 | de contenidos en formato JSON.\r\n\r\n[Archivos] 1) Archivos JSON: Se | ||
220 | dividen en TRAIN y TEST. Cada archivo contiene una lista de hashes | 220 | dividen en TRAIN y TEST. Cada archivo contiene una lista de hashes | ||
221 | para cada texto, y cada hash contiene los siguientes campos: \u2022 | 221 | para cada texto, y cada hash contiene los siguientes campos: \u2022 | ||
222 | text: el contenido textual. \u2022 data_source: el repositorio fuente | 222 | text: el contenido textual. \u2022 data_source: el repositorio fuente | ||
223 | del texto. \u2022 filename: el nombre del archivo original del que se | 223 | del texto. \u2022 filename: el nombre del archivo original del que se | ||
224 | obtuvieron los datos. \u2022 source: etiqueta que indica si se trata | 224 | obtuvieron los datos. \u2022 source: etiqueta que indica si se trata | ||
225 | de un texto escrito por humanos (HUMAN) o del LLM utilizado para | 225 | de un texto escrito por humanos (HUMAN) o del LLM utilizado para | ||
226 | generarlo (\"gpt4o\", \"mistral\" o \"llama\"). \u2022 \"language\": | 226 | generarlo (\"gpt4o\", \"mistral\" o \"llama\"). \u2022 \"language\": | ||
227 | el c\u00f3digo de idioma del texto: alem\u00e1n (\"de\"), ingl\u00e9s | 227 | el c\u00f3digo de idioma del texto: alem\u00e1n (\"de\"), ingl\u00e9s | ||
228 | (\"en\"), espa\u00f1ol (\"es\") o franc\u00e9s (\"fr\"). \u2022 | 228 | (\"en\"), espa\u00f1ol (\"es\") o franc\u00e9s (\"fr\"). \u2022 | ||
229 | \"target\": una etiqueta binaria para indicar si el texto fue escrito | 229 | \"target\": una etiqueta binaria para indicar si el texto fue escrito | ||
230 | por humanos (\"0\") o por IA (\"1\"). \u2022 \"ratio\": la | 230 | por humanos (\"0\") o por IA (\"1\"). \u2022 \"ratio\": la | ||
231 | proporci\u00f3n del texto creado con IA: \"0,5\" para textos generados | 231 | proporci\u00f3n del texto creado con IA: \"0,5\" para textos generados | ||
232 | por IA y \"null\" para textos humanos.\r\n\r\nEl corpus consta de | 232 | por IA y \"null\" para textos humanos.\r\n\r\nEl corpus consta de | ||
233 | 13.292 textos comparables y paralelos en cuatro idiomas: alem\u00e1n, | 233 | 13.292 textos comparables y paralelos en cuatro idiomas: alem\u00e1n, | ||
234 | ingl\u00e9s, espa\u00f1ol y franc\u00e9s. El total de tokens es de | 234 | ingl\u00e9s, espa\u00f1ol y franc\u00e9s. El total de tokens es de | ||
235 | 3.795.449. Este recurso est\u00e1 destinado al entrenamiento y la | 235 | 3.795.449. Este recurso est\u00e1 destinado al entrenamiento y la | ||
236 | evaluaci\u00f3n de modelos para la detecci\u00f3n de textos | 236 | evaluaci\u00f3n de modelos para la detecci\u00f3n de textos | ||
237 | m\u00e9dicos creados mediante inteligencia artificial generativa." | 237 | m\u00e9dicos creados mediante inteligencia artificial generativa." | ||
238 | }, | 238 | }, | ||
239 | "groups": [], | 239 | "groups": [], | ||
240 | "id": "ade96985-70e0-41d8-b69c-003013a24503", | 240 | "id": "ade96985-70e0-41d8-b69c-003013a24503", | ||
241 | "identifier": "http://hdl.handle.net/10261/389309", | 241 | "identifier": "http://hdl.handle.net/10261/389309", | ||
242 | "instituto": [ | 242 | "instituto": [ | ||
243 | "Instituto de Lengua, Literatura y Antropolog\u00eda (ILLA), CSIC" | 243 | "Instituto de Lengua, Literatura y Antropolog\u00eda (ILLA), CSIC" | ||
244 | ], | 244 | ], | ||
245 | "international_spatial_translated": { | 245 | "international_spatial_translated": { | ||
246 | "en": "Europe", | 246 | "en": "Europe", | ||
247 | "es": "Europa" | 247 | "es": "Europa" | ||
248 | }, | 248 | }, | ||
249 | "isopen": false, | 249 | "isopen": false, | ||
250 | "issued_date": "2025-05-14T00:00:00", | 250 | "issued_date": "2025-05-14T00:00:00", | ||
251 | "language": [ | 251 | "language": [ | ||
252 | "es", | 252 | "es", | ||
253 | "en", | 253 | "en", | ||
254 | "fr" | 254 | "fr" | ||
255 | ], | 255 | ], | ||
256 | "license_id": "https://digital.csic.es/handle/10261/389309", | 256 | "license_id": "https://digital.csic.es/handle/10261/389309", | ||
257 | "license_title": "https://digital.csic.es/handle/10261/389309", | 257 | "license_title": "https://digital.csic.es/handle/10261/389309", | ||
258 | "maintainer": null, | 258 | "maintainer": null, | ||
259 | "maintainer_email": null, | 259 | "maintainer_email": null, | ||
260 | "metadata_created": "2025-09-09T05:15:08.878441", | 260 | "metadata_created": "2025-09-09T05:15:08.878441", | ||
n | 261 | "metadata_modified": "2025-09-09T05:35:49.834787", | n | 261 | "metadata_modified": "2025-09-09T05:36:05.594182", |
262 | "modified_date": "2025-09-09T07:15:08", | 262 | "modified_date": "2025-09-09T07:15:08", | ||
263 | "multilingual_tags": { | 263 | "multilingual_tags": { | ||
264 | "en": [ | 264 | "en": [ | ||
265 | "AI-generated Text", | 265 | "AI-generated Text", | ||
266 | "Generative AI", | 266 | "Generative AI", | ||
267 | "Biomedical natural language processing", | 267 | "Biomedical natural language processing", | ||
268 | "Biomedical corpus" | 268 | "Biomedical corpus" | ||
269 | ], | 269 | ], | ||
270 | "es": [ | 270 | "es": [ | ||
271 | "Textos generacos con IA", | 271 | "Textos generacos con IA", | ||
272 | "IA generativa", | 272 | "IA generativa", | ||
273 | "Procesamiento del Lenguaje Natural Biom\u00e9dico", | 273 | "Procesamiento del Lenguaje Natural Biom\u00e9dico", | ||
274 | "Corpus biom\u00e9dico" | 274 | "Corpus biom\u00e9dico" | ||
275 | ] | 275 | ] | ||
276 | }, | 276 | }, | ||
277 | "name": | 277 | "name": | ||
278 | telligence-text-detection-in-multilingual-settings-medaid-ml-dataset", | 278 | telligence-text-detection-in-multilingual-settings-medaid-ml-dataset", | ||
279 | "notes": null, | 279 | "notes": null, | ||
280 | "num_resources": 3, | 280 | "num_resources": 3, | ||
281 | "num_tags": 0, | 281 | "num_tags": 0, | ||
282 | "organization": { | 282 | "organization": { | ||
283 | "approval_status": "approved", | 283 | "approval_status": "approved", | ||
284 | "created": "2025-04-15T15:18:43.186369", | 284 | "created": "2025-04-15T15:18:43.186369", | ||
285 | "description": "Agencia Estatal Consejo Superior de | 285 | "description": "Agencia Estatal Consejo Superior de | ||
286 | Investigaciones Cient\u00edficas", | 286 | Investigaciones Cient\u00edficas", | ||
287 | "id": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", | 287 | "id": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", | ||
288 | "image_url": "2025-04-15-131843.179987csic.png", | 288 | "image_url": "2025-04-15-131843.179987csic.png", | ||
289 | "is_organization": true, | 289 | "is_organization": true, | ||
290 | "name": | 290 | "name": | ||
291 | agencia-estatal-consejo-superior-de-investigaciones-cientificas-csic", | 291 | agencia-estatal-consejo-superior-de-investigaciones-cientificas-csic", | ||
292 | "state": "active", | 292 | "state": "active", | ||
293 | "title": "CSIC", | 293 | "title": "CSIC", | ||
294 | "type": "organization" | 294 | "type": "organization" | ||
295 | }, | 295 | }, | ||
296 | "owner_org": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", | 296 | "owner_org": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", | ||
297 | "private": false, | 297 | "private": false, | ||
298 | "proyecto": {}, | 298 | "proyecto": {}, | ||
299 | "publisher": "b627d71d-2315-4e75-afc9-897da84459f0", | 299 | "publisher": "b627d71d-2315-4e75-afc9-897da84459f0", | ||
300 | "reference": [ | 300 | "reference": [ | ||
301 | "https://doi.org/10.1007/978-3-032-04354-2_5", | 301 | "https://doi.org/10.1007/978-3-032-04354-2_5", | ||
302 | "https://github.com/Padraig20/MedAID-ML", | 302 | "https://github.com/Padraig20/MedAID-ML", | ||
303 | "https://doi.org/10.20350/digitalCSIC/17276" | 303 | "https://doi.org/10.20350/digitalCSIC/17276" | ||
304 | ], | 304 | ], | ||
305 | "relationships_as_object": [], | 305 | "relationships_as_object": [], | ||
306 | "relationships_as_subject": [], | 306 | "relationships_as_subject": [], | ||
307 | "resources": [ | 307 | "resources": [ | ||
308 | { | 308 | { | ||
309 | "cache_last_updated": null, | 309 | "cache_last_updated": null, | ||
310 | "cache_url": null, | 310 | "cache_url": null, | ||
n | n | 311 | "created": "2025-09-09T05:15:08.882167", | ||
312 | "datastore_active": false, | ||||
313 | "description": null, | ||||
314 | "format": "txt", | ||||
315 | "hash": "", | ||||
316 | "id": "240d64f1-a51b-4893-aae1-944abe275335", | ||||
317 | "last_modified": null, | ||||
318 | "metadata_modified": "2025-09-09T05:36:05.597037", | ||||
319 | "mimetype": null, | ||||
320 | "mimetype_inner": null, | ||||
321 | "name": "README.txt", | ||||
322 | "name_translated": { | ||||
323 | "en": "README.txt", | ||||
324 | "es": "README.txt" | ||||
325 | }, | ||||
326 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | ||||
327 | "position": 0, | ||||
328 | "resource_identifier": "", | ||||
329 | "resource_relation": [], | ||||
330 | "resource_type": null, | ||||
331 | "size": null, | ||||
332 | "state": "active", | ||||
333 | "url": | ||||
334 | "https://digital.csic.es/bitstream/10261/389309/6/README.txt", | ||||
335 | "url_type": null | ||||
336 | }, | ||||
337 | { | ||||
338 | "cache_last_updated": null, | ||||
339 | "cache_url": null, | ||||
311 | "created": "2025-09-09T05:15:08.882163", | 340 | "created": "2025-09-09T05:15:08.882163", | ||
312 | "datastore_active": false, | 341 | "datastore_active": false, | ||
313 | "description": null, | 342 | "description": null, | ||
314 | "format": "json", | 343 | "format": "json", | ||
315 | "hash": "", | 344 | "hash": "", | ||
316 | "id": "b79c78c7-4748-4f8f-9661-95d5cfe99125", | 345 | "id": "b79c78c7-4748-4f8f-9661-95d5cfe99125", | ||
317 | "last_modified": null, | 346 | "last_modified": null, | ||
n | 318 | "metadata_modified": "2025-09-09T05:15:08.876501", | n | 347 | "metadata_modified": "2025-09-09T05:36:05.597157", |
319 | "mimetype": null, | 348 | "mimetype": null, | ||
320 | "mimetype_inner": null, | 349 | "mimetype_inner": null, | ||
321 | "name": "dataset_test.json", | 350 | "name": "dataset_test.json", | ||
322 | "name_translated": { | 351 | "name_translated": { | ||
323 | "en": "dataset_test.json", | 352 | "en": "dataset_test.json", | ||
324 | "es": "dataset_test.json" | 353 | "es": "dataset_test.json" | ||
325 | }, | 354 | }, | ||
326 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | 355 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | ||
n | 327 | "position": 0, | n | 356 | "position": 1, |
328 | "resource_identifier": "", | 357 | "resource_identifier": "", | ||
329 | "resource_relation": [], | 358 | "resource_relation": [], | ||
330 | "resource_type": null, | 359 | "resource_type": null, | ||
331 | "size": null, | 360 | "size": null, | ||
332 | "state": "active", | 361 | "state": "active", | ||
333 | "url": | 362 | "url": | ||
334 | "https://digital.csic.es/bitstream/10261/389309/1/dataset_test.json", | 363 | "https://digital.csic.es/bitstream/10261/389309/1/dataset_test.json", | ||
335 | "url_type": null | 364 | "url_type": null | ||
336 | }, | 365 | }, | ||
337 | { | 366 | { | ||
338 | "cache_last_updated": null, | 367 | "cache_last_updated": null, | ||
339 | "cache_url": null, | 368 | "cache_url": null, | ||
340 | "created": "2025-09-09T05:15:08.882165", | 369 | "created": "2025-09-09T05:15:08.882165", | ||
341 | "datastore_active": false, | 370 | "datastore_active": false, | ||
342 | "description": null, | 371 | "description": null, | ||
343 | "format": "json", | 372 | "format": "json", | ||
344 | "hash": "", | 373 | "hash": "", | ||
345 | "id": "dee0c93c-9756-4efa-ab3b-38b21f604650", | 374 | "id": "dee0c93c-9756-4efa-ab3b-38b21f604650", | ||
346 | "last_modified": null, | 375 | "last_modified": null, | ||
n | 347 | "metadata_modified": "2025-09-09T05:15:08.876616", | n | 376 | "metadata_modified": "2025-09-09T05:36:05.597234", |
348 | "mimetype": null, | 377 | "mimetype": null, | ||
349 | "mimetype_inner": null, | 378 | "mimetype_inner": null, | ||
350 | "name": "dataset_train.json", | 379 | "name": "dataset_train.json", | ||
351 | "name_translated": { | 380 | "name_translated": { | ||
352 | "en": "dataset_train.json", | 381 | "en": "dataset_train.json", | ||
353 | "es": "dataset_train.json" | 382 | "es": "dataset_train.json" | ||
n | 354 | }, | n | ||
355 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | ||||
356 | "position": 1, | ||||
357 | "resource_identifier": "", | ||||
358 | "resource_relation": [], | ||||
359 | "resource_type": null, | ||||
360 | "size": null, | ||||
361 | "state": "active", | ||||
362 | "url": | ||||
363 | "https://digital.csic.es/bitstream/10261/389309/2/dataset_train.json", | ||||
364 | "url_type": null | ||||
365 | }, | ||||
366 | { | ||||
367 | "cache_last_updated": null, | ||||
368 | "cache_url": null, | ||||
369 | "created": "2025-09-09T05:15:08.882167", | ||||
370 | "datastore_active": false, | ||||
371 | "description": null, | ||||
372 | "format": "txt", | ||||
373 | "hash": "", | ||||
374 | "id": "240d64f1-a51b-4893-aae1-944abe275335", | ||||
375 | "last_modified": null, | ||||
376 | "metadata_modified": "2025-09-09T05:15:08.876714", | ||||
377 | "mimetype": null, | ||||
378 | "mimetype_inner": null, | ||||
379 | "name": "README.txt", | ||||
380 | "name_translated": { | ||||
381 | "en": "README.txt", | ||||
382 | "es": "README.txt" | ||||
383 | }, | 383 | }, | ||
384 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | 384 | "package_id": "ade96985-70e0-41d8-b69c-003013a24503", | ||
385 | "position": 2, | 385 | "position": 2, | ||
386 | "resource_identifier": "", | 386 | "resource_identifier": "", | ||
387 | "resource_relation": [], | 387 | "resource_relation": [], | ||
388 | "resource_type": null, | 388 | "resource_type": null, | ||
389 | "size": null, | 389 | "size": null, | ||
390 | "state": "active", | 390 | "state": "active", | ||
391 | "url": | 391 | "url": | ||
t | 392 | "https://digital.csic.es/bitstream/10261/389309/6/README.txt", | t | 392 | "https://digital.csic.es/bitstream/10261/389309/2/dataset_train.json", |
393 | "url_type": null | 393 | "url_type": null | ||
394 | } | 394 | } | ||
395 | ], | 395 | ], | ||
396 | "spatial": [], | 396 | "spatial": [], | ||
397 | "state": "active", | 397 | "state": "active", | ||
398 | "tags": [], | 398 | "tags": [], | ||
399 | "theme": [ | 399 | "theme": [ | ||
400 | 400 | ||||
401 | "http://datos.gob.es/kos/sector-publico/sector/ciencia-tecnologia", | 401 | "http://datos.gob.es/kos/sector-publico/sector/ciencia-tecnologia", | ||
402 | "http://datos.gob.es/kos/sector-publico/sector/salud" | 402 | "http://datos.gob.es/kos/sector-publico/sector/salud" | ||
403 | ], | 403 | ], | ||
404 | "title": "Medical Artificial Intelligence text Detection in | 404 | "title": "Medical Artificial Intelligence text Detection in | ||
405 | Multilingual settings (MedAID-ML)", | 405 | Multilingual settings (MedAID-ML)", | ||
406 | "title_translated": { | 406 | "title_translated": { | ||
407 | "en": "Medical Artificial Intelligence text Detection in | 407 | "en": "Medical Artificial Intelligence text Detection in | ||
408 | Multilingual settings (MedAID-ML)", | 408 | Multilingual settings (MedAID-ML)", | ||
409 | "es": "Medical Artificial Intelligence text Detection in | 409 | "es": "Medical Artificial Intelligence text Detection in | ||
410 | Multilingual settings (MedAID-ML)" | 410 | Multilingual settings (MedAID-ML)" | ||
411 | }, | 411 | }, | ||
412 | "type": "dataset", | 412 | "type": "dataset", | ||
413 | "url": null, | 413 | "url": null, | ||
414 | "version": null | 414 | "version": null | ||
415 | } | 415 | } |