Changes

On September 9, 2025 at 5:36:05 AM UTC, Administrador CKAN:
No fields were updated. See the metadata diff for more details.
              
    
          
          
        
        
            f 1 { f 1 {
            2   "Observaciones": { 2   "Observaciones": {
            3     "en": "Recommended citation : Styll, Patrick; Campillos-Llanos,  3     "en": "Recommended citation : Styll, Patrick; Campillos-Llanos, 
            4 Leonardo; 2025; Medical Artificial Intelligence text Detection in  4 Leonardo; 2025; Medical Artificial Intelligence text Detection in 
            5 Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC;  5 Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; 
            6 https://doi.org/10.20350/digitalCSIC/17276", 6 https://doi.org/10.20350/digitalCSIC/17276",
            7     "es": "Cita recomendada: Styll, Patrick; Campillos-Llanos,  7     "es": "Cita recomendada: Styll, Patrick; Campillos-Llanos, 
            8 Leonardo; 2025; Medical Artificial Intelligence text Detection in  8 Leonardo; 2025; Medical Artificial Intelligence text Detection in 
            9 Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC;  9 Multilingual settings (MedAID-ML) [Dataset]; DIGITAL.CSIC; 
            10 https://doi.org/10.20350/digitalCSIC/17276" 10 https://doi.org/10.20350/digitalCSIC/17276"
            11   }, 11   },
            12   "author": null, 12   "author": null,
            13   "author_email": null, 13   "author_email": null,
            14   "autor": { 14   "autor": {
            15     "en": [ 15     "en": [
            16       "Patrick Styll", 16       "Patrick Styll",
            17       "Leonardo Campillos-Llanos" 17       "Leonardo Campillos-Llanos"
            18     ], 18     ],
            19     "es": [ 19     "es": [
            20       "Patrick Styll", 20       "Patrick Styll",
            21       "Leonardo Campillos-Llanos" 21       "Leonardo Campillos-Llanos"
            22     ] 22     ]
            23   }, 23   },
            24   "conforms_to": [], 24   "conforms_to": [],
            25   "coverage_new": { 25   "coverage_new": {
            26     "1": { 26     "1": {
            27       "from": "2025-03-15T00:00:00", 27       "from": "2025-03-15T00:00:00",
            28       "to": "2025-03-15T00:00:00" 28       "to": "2025-03-15T00:00:00"
            29     } 29     }
            30   }, 30   },
            31   "creator_user_id": "196556b3-e0c4-4c51-a9e6-f51cc752bc37", 31   "creator_user_id": "196556b3-e0c4-4c51-a9e6-f51cc752bc37",
            32   "description": { 32   "description": {
            33     "en": "This dataset was created by gathering human-authored  33     "en": "This dataset was created by gathering human-authored 
            34 corpora from several public health sites and generating additional  34 corpora from several public health sites and generating additional 
            35 data via three different LLMs: GPT-4o, Mistral-7B and Llama3-1. We  35 data via three different LLMs: GPT-4o, Mistral-7B and Llama3-1. We 
            36 included texts in English, Spanish, German and French from the  36 included texts in English, Spanish, German and French from the 
            37 biomedical domain. The current version gathers 50% AI-generated and  37 biomedical domain. The current version gathers 50% AI-generated and 
            38 50% human-written texts. The following are the data we used:\r\n\r\n-  38 50% human-written texts. The following are the data we used:\r\n\r\n- 
            39 Cochrane Library: This is a database of meta-analyses and systematic  39 Cochrane Library: This is a database of meta-analyses and systematic 
            40 reviews of updated results of clinical studies. We used abstracts of  40 reviews of updated results of clinical studies. We used abstracts of 
            41 systematic reviews in all four languages.\r\n\r\n- European Clinical  41 systematic reviews in all four languages.\r\n\r\n- European Clinical 
            42 Trials (EUCT): This agency supervises and evaluates  42 Trials (EUCT): This agency supervises and evaluates 
            43 pharmaceutical products of the European Union (EU). We downloaded  43 pharmaceutical products of the European Union (EU). We downloaded 
            44 parallel data from public assessment reports (EPARs) from 12 new  44 parallel data from public assessment reports (EPARs) from 12 new 
            45 medicinal products, and data from clinical trial protocols and  45 medicinal products, and data from clinical trial protocols and 
            46 eligibility criteria. We ensured the data were published only from  46 eligibility criteria. We ensured the data were published only from 
            47 January 2025 to date. The goal was gathering data that might not have  47 January 2025 to date. The goal was gathering data that might not have 
            48 been used to train the LLMs in our experiments.\r\n\r\n- European  48 been used to train the LLMs in our experiments.\r\n\r\n- European 
            49 Medicines Agency (EMA): This agency supervises and evaluates  49 Medicines Agency (EMA): This agency supervises and evaluates 
            50 pharmaceutical products of the European Union (EU). We downloaded  50 pharmaceutical products of the European Union (EU). We downloaded 
            51 parallel data from public assessment reports (EPARs) from 12 new  51 parallel data from public assessment reports (EPARs) from 12 new 
            52 medicinal products, and data from clinical trial protocols and  52 medicinal products, and data from clinical trial protocols and 
            53 eligibility criteria. We ensured the data were published only from  53 eligibility criteria. We ensured the data were published only from 
            54 January 2025 to date. The goal was gathering data that might not have  54 January 2025 to date. The goal was gathering data that might not have 
            55 been used to train the LLMs in our experiments.\r\n\r\n- European Food  55 been used to train the LLMs in our experiments.\r\n\r\n- European Food 
            56 Safety Authority (EFSA): This website provides a comprehensive range  56 Safety Authority (EFSA): This website provides a comprehensive range 
            57 of data about food consumption and chemical/biological monitoring  57 of data about food consumption and chemical/biological monitoring 
            58 data. We chose only the topics we deem necessary for our goals,  58 data. We chose only the topics we deem necessary for our goals, 
            59 therefore including a total of 51 topics. Processing: we manually  59 therefore including a total of 51 topics. Processing: we manually 
            60 split articles with a word count above 1350 and manually ensured  60 split articles with a word count above 1350 and manually ensured 
            61 their correctness and alignment in all languages.\r\n\r\n- European  61 their correctness and alignment in all languages.\r\n\r\n- European 
            62 Vaccination Information Portal (EVIP): it provides up-to-date  62 Vaccination Information Portal (EVIP): it provides up-to-date 
            63 information on vaccines and vaccination. The factsheets are available  63 information on vaccines and vaccination. The factsheets are available 
            64 in all languages, and consist of 20 texts each.\r\n\r\n- Immunize:  64 in all languages, and consist of 20 texts each.\r\n\r\n- Immunize: 
            65 Immunize.org (formerly known as the Immunization Action Coalition) is  65 Immunize.org (formerly known as the Immunization Action Coalition) is 
            66 a U.S.-based organization dedicated to providing comprehensive  66 a U.S.-based organization dedicated to providing comprehensive 
            67 immunization resources for healthcare professionals and the public.  67 immunization resources for healthcare professionals and the public. 
            68 Vaccine Information Sheets (VISs) have been translated into several  68 Vaccine Information Sheets (VISs) have been translated into several 
            69 languages, but not all of them contain all VISs. They are given as  69 languages, but not all of them contain all VISs. They are given as 
            70 PDFs, with 25 in Spanish, French and English, but only 21 in German.  70 PDFs, with 25 in Spanish, French and English, but only 21 in German. 
            71 Only PDFs overlapping in all languages were used.\r\n\r\n- Migration  71 Only PDFs overlapping in all languages were used.\r\n\r\n- Migration 
            72 und Gesundheit - German Ministry of Health (BFG): This portal provides  72 und Gesundheit - German Ministry of Health (BFG): This portal provides 
            73 multilingual health information tailored for migrants and refugees.  73 multilingual health information tailored for migrants and refugees. 
            74 Gesundheit f\u00fcr alle is a PDF file that provides a guide to the  74 Gesundheit f\u00fcr alle is a PDF file that provides a guide to the 
            75 German healthcare system, and it is available in Spanish, English and  75 German healthcare system, and it is available in Spanish, English and 
            76 German. Processing: Two topics, which were shorter than 100 words,  76 German. Processing: Two topics, which were shorter than 100 words, 
            77 were merged with the next one to ensure that context is  77 were merged with the next one to ensure that context is 
            78 preserved.\r\n\r\n- Orphadata (INSERM): a comprehensive knowledge base  78 preserved.\r\n\r\n- Orphadata (INSERM): a comprehensive knowledge base 
            79 about rare diseases and orphan drugs, in re-usable and high-quality  79 about rare diseases and orphan drugs, in re-usable and high-quality 
            80 formats, released in 12 official EU languages. We gathered  80 formats, released in 12 official EU languages. We gathered 
            81 definitions, signs and symptoms and phenotypes about 4389 rare  81 definitions, signs and symptoms and phenotypes about 4389 rare 
            82 diseases in English, German, Spanish and French. Processing: Since  82 diseases in English, German, Spanish and French. Processing: Since 
            83 each definition is roughly the same size and similar format, we simply  83 each definition is roughly the same size and similar format, we simply 
            84 group 5 definitions together to make the text per topic  84 group 5 definitions together to make the text per topic 
            85 longer.\r\n\r\n- PubMed (National Library of Medicine): we downloaded  85 longer.\r\n\r\n- PubMed (National Library of Medicine): we downloaded 
            86 abstracts available in English, Spanish, French and German.\r\n\r\n-  86 abstracts available in English, Spanish, French and German.\r\n\r\n- 
            87 Wikipedia: a free, web-based, collaborative multilingual encyclopedia  87 Wikipedia: a free, web-based, collaborative multilingual encyclopedia 
            88 project; we selected (bio)medical contents available in English,  88 project; we selected (bio)medical contents available in English, 
            89 German, Spanish and French. To ensure that the texts were not  89 German, Spanish and French. To ensure that the texts were not 
            90 automatically generated, we only use articles that date back to before  90 automatically generated, we only use articles that date back to before 
            91 the release of ChatGPT, i.e. before 30th November 2022. Processing:  91 the release of ChatGPT, i.e. before 30th November 2022. Processing: 
            92 some data cleaning was necessary; we also removed all topics with less  92 some data cleaning was necessary; we also removed all topics with less 
            93 than 5 words, or split those with more than 9 sentences into equally  93 than 5 words, or split those with more than 9 sentences into equally 
            94 long parts. From these split up files, we make sure that they contain  94 long parts. From these split up files, we make sure that they contain 
            95 a minimum of 100 words, and we take only those contents or topics that  95 a minimum of 100 words, and we take only those contents or topics that 
            96 exist in all three languages.\r\n\r\n[Description of methods used for  96 exist in all three languages.\r\n\r\n[Description of methods used for 
            97 collection/generation of data] The corpus statistics and methods are  97 collection/generation of data] The corpus statistics and methods are 
            98 explained in the following article: Patrick Styll, Leonardo  98 explained in the following article: Patrick Styll, Leonardo 
            99 Campillos-Llanos, Jorge Fern\u00e1ndez-Garc\u00eda, Isabel  99 Campillos-Llanos, Jorge Fern\u00e1ndez-Garc\u00eda, Isabel 
            100 Segura-Bedmar (2025) \"MedAID-ML: A Multilingual Dataset of Biomedical  100 Segura-Bedmar (2025) \"MedAID-ML: A Multilingual Dataset of Biomedical 
            101 Texts for Detecting AI-Generated Content\". Under  101 Texts for Detecting AI-Generated Content\". Under 
            102 review.\r\n\r\n[Methods for processing the data] - Web-scraping of  102 review.\r\n\r\n[Methods for processing the data] - Web-scraping of 
            103 data from HTML content and PDF files available on the websites of the  103 data from HTML content and PDF files available on the websites of the 
            104 health contents.\r\n- Postprocessing and cleaning of data (e.g.,  104 health contents.\r\n- Postprocessing and cleaning of data (e.g., 
            105 removal of redundant white spaces or line breaks), and homogenization  105 removal of redundant white spaces or line breaks), and homogenization 
            106 of text length.\r\n- Generation of corresponding contents by means of  106 of text length.\r\n- Generation of corresponding contents by means of 
            107 generative AI using three large language models: GPT-4o, Mistral-7B  107 generative AI using three large language models: GPT-4o, Mistral-7B 
            108 and Llama3-1.\r\n- Formatting of contents into JSON  108 and Llama3-1.\r\n- Formatting of contents into JSON 
            109 format.\r\n\r\n[Files] 1) JSON files: These are separated in TRAIN and  109 format.\r\n\r\n[Files] 1) JSON files: These are separated in TRAIN and 
            110 TEST. Each file has a list of hashes for each text, and each hash  110 TEST. Each file has a list of hashes for each text, and each hash 
            111 contains the following fields:\r\n \u2022 text: the textual  111 contains the following fields:\r\n \u2022 text: the textual 
            112 content.\r\n \u2022 data_source: the source repository of the  112 content.\r\n \u2022 data_source: the source repository of the 
            113 text.\r\n \u2022 filename: the name of the original file from which  113 text.\r\n \u2022 filename: the name of the original file from which 
            114 the data were sourced.\r\n \u2022 source: label indicating if it is a  114 the data were sourced.\r\n \u2022 source: label indicating if it is a 
            115 human-written text (HUMAN) or the LLM used to generate the text  115 human-written text (HUMAN) or the LLM used to generate the text 
            116 (\"gpt4o\", \"mistral\" or \"llama\").\r\n \u2022 \"language\": The  116 (\"gpt4o\", \"mistral\" or \"llama\").\r\n \u2022 \"language\": The 
            117 language code of the text: German (\"de\"), English (\"en\"), Spanish  117 language code of the text: German (\"de\"), English (\"en\"), Spanish 
            118 (\"es\") or French (\"fr\").\r\n \u2022 \"target\": a binary label to  118 (\"es\") or French (\"fr\").\r\n \u2022 \"target\": a binary label to 
            119 code if the text is written by humans (\"0\") or AI (\"1\").\r\n  119 code if the text is written by humans (\"0\") or AI (\"1\").\r\n 
            120 \u2022 \"ratio\": The proportion of the text that was created with AI:  120 \u2022 \"ratio\": The proportion of the text that was created with AI: 
            121 \"0.5\" for AI-generated texts, and \"null\" for human  121 \"0.5\" for AI-generated texts, and \"null\" for human 
            122 texts.\r\n\r\nThe corpus is made up of 13292 comparable and parallel  122 texts.\r\n\r\nThe corpus is made up of 13292 comparable and parallel 
            123 texts in four languages: German, English, Spanish and French. The  123 texts in four languages: German, English, Spanish and French. The 
            124 total token count is 3795449 tokens.\r\nThis resource is aimed at  124 total token count is 3795449 tokens.\r\nThis resource is aimed at 
            125 training and evaluating models to detect medical texts created by  125 training and evaluating models to detect medical texts created by 
            126 means of generative artificial intelligence.", 126 means of generative artificial intelligence.",
            127     "es": "Este conjunto de datos se cre\u00f3 recopilando corpus de  127     "es": "Este conjunto de datos se cre\u00f3 recopilando corpus de 
            128 autor\u00eda humana de varios centros de salud p\u00fablica y  128 autor\u00eda humana de varios centros de salud p\u00fablica y 
            129 generando datos adicionales mediante tres LLM diferentes: GPT-4o,  129 generando datos adicionales mediante tres LLM diferentes: GPT-4o, 
            130 Mistral-7B y Llama3-1. Incluimos textos en ingl\u00e9s, espa\u00f1ol,  130 Mistral-7B y Llama3-1. Incluimos textos en ingl\u00e9s, espa\u00f1ol, 
            131 alem\u00e1n y franc\u00e9s del \u00e1mbito biom\u00e9dico. La  131 alem\u00e1n y franc\u00e9s del \u00e1mbito biom\u00e9dico. La 
            132 versi\u00f3n actual recopila un 50 % de textos generados por IA y un  132 versi\u00f3n actual recopila un 50 % de textos generados por IA y un 
            133 50 % de textos escritos por humanos. A continuaci\u00f3n, se detallan  133 50 % de textos escritos por humanos. A continuaci\u00f3n, se detallan 
            134 los datos utilizados:\r\n\r\nBiblioteca Cochrane: Base de datos de  134 los datos utilizados:\r\n\r\nBiblioteca Cochrane: Base de datos de 
            135 metaan\u00e1lisis y revisiones sistem\u00e1ticas con resultados  135 metaan\u00e1lisis y revisiones sistem\u00e1ticas con resultados 
            136 actualizados de estudios cl\u00ednicos. Se utilizaron res\u00famenes  136 actualizados de estudios cl\u00ednicos. Se utilizaron res\u00famenes 
            137 de revisiones sistem\u00e1ticas en los cuatro idiomas.\r\n\r\nEnsayos  137 de revisiones sistem\u00e1ticas en los cuatro idiomas.\r\n\r\nEnsayos 
            138 Cl\u00ednicos Europeos (EUCT): Esta agencia supervisa y eval\u00faa  138 Cl\u00ednicos Europeos (EUCT): Esta agencia supervisa y eval\u00faa 
            139 los productos farmac\u00e9uticos de la Uni\u00f3n Europea (UE).  139 los productos farmac\u00e9uticos de la Uni\u00f3n Europea (UE). 
            140 Descargamos datos paralelos de los informes p\u00fablicos de  140 Descargamos datos paralelos de los informes p\u00fablicos de 
            141 evaluaci\u00f3n (EPAR) de 12 nuevos medicamentos, as\u00ed como datos  141 evaluaci\u00f3n (EPAR) de 12 nuevos medicamentos, as\u00ed como datos 
            142 de los protocolos de ensayos cl\u00ednicos y los criterios de  142 de los protocolos de ensayos cl\u00ednicos y los criterios de 
            143 elegibilidad. Nos aseguramos de que los datos se publicaran  143 elegibilidad. Nos aseguramos de que los datos se publicaran 
            144 \u00fanicamente desde enero de 2025 hasta la fecha. El objetivo era  144 \u00fanicamente desde enero de 2025 hasta la fecha. El objetivo era 
            145 recopilar datos que podr\u00edan no haberse utilizado para entrenar a  145 recopilar datos que podr\u00edan no haberse utilizado para entrenar a 
            146 los LLM en nuestros experimentos.\r\n\r\nAgencia Europea de  146 los LLM en nuestros experimentos.\r\n\r\nAgencia Europea de 
            147 Medicamentos (EMA): Esta agencia supervisa y eval\u00faa los productos  147 Medicamentos (EMA): Esta agencia supervisa y eval\u00faa los productos 
            148 farmac\u00e9uticos de la Uni\u00f3n Europea (UE). Descargamos datos  148 farmac\u00e9uticos de la Uni\u00f3n Europea (UE). Descargamos datos 
            149 paralelos de los informes p\u00fablicos de evaluaci\u00f3n (EPAR) de  149 paralelos de los informes p\u00fablicos de evaluaci\u00f3n (EPAR) de 
            150 12 nuevos medicamentos, as\u00ed como datos de los protocolos de  150 12 nuevos medicamentos, as\u00ed como datos de los protocolos de 
            151 ensayos cl\u00ednicos y los criterios de elegibilidad. Nos aseguramos  151 ensayos cl\u00ednicos y los criterios de elegibilidad. Nos aseguramos 
            152 de que los datos se publicaran \u00fanicamente desde enero de 2025  152 de que los datos se publicaran \u00fanicamente desde enero de 2025 
            153 hasta la fecha. El objetivo era recopilar datos que podr\u00edan no  153 hasta la fecha. El objetivo era recopilar datos que podr\u00edan no 
            154 haberse utilizado para entrenar a los LLM en nuestros  154 haberse utilizado para entrenar a los LLM en nuestros 
            155 experimentos.\r\n\r\nAutoridad Europea de Seguridad Alimentaria  155 experimentos.\r\n\r\nAutoridad Europea de Seguridad Alimentaria 
            156 (AESA): Este sitio web ofrece una amplia gama de datos sobre consumo  156 (AESA): Este sitio web ofrece una amplia gama de datos sobre consumo 
            157 de alimentos y datos de control qu\u00edmico/biol\u00f3gico.  157 de alimentos y datos de control qu\u00edmico/biol\u00f3gico. 
            158 Seleccionamos \u00fanicamente los temas que consideramos necesarios  158 Seleccionamos \u00fanicamente los temas que consideramos necesarios 
            159 para nuestros objetivos, por lo que incluimos un total de 51.  159 para nuestros objetivos, por lo que incluimos un total de 51. 
            160 Procesamiento: dividimos manualmente los art\u00edculos con m\u00e1s  160 Procesamiento: dividimos manualmente los art\u00edculos con m\u00e1s 
            161 de 1350 palabras y verificamos manualmente su correcci\u00f3n y  161 de 1350 palabras y verificamos manualmente su correcci\u00f3n y 
            162 alineaci\u00f3n en todos los idiomas.\r\n\r\nPortal Europeo de  162 alineaci\u00f3n en todos los idiomas.\r\n\r\nPortal Europeo de 
            163 Informaci\u00f3n sobre Vacunaci\u00f3n (EVIP): Ofrece informaci\u00f3n  163 Informaci\u00f3n sobre Vacunaci\u00f3n (EVIP): Ofrece informaci\u00f3n 
            164 actualizada sobre vacunas y vacunaci\u00f3n. Las fichas informativas  164 actualizada sobre vacunas y vacunaci\u00f3n. Las fichas informativas 
            165 est\u00e1n disponibles en todos los idiomas y constan de 20 textos  165 est\u00e1n disponibles en todos los idiomas y constan de 20 textos 
            166 cada una.\r\n\r\nInmunizar: Immunize.org (anteriormente conocida como  166 cada una.\r\n\r\nInmunizar: Immunize.org (anteriormente conocida como 
            167 la Coalici\u00f3n de Acci\u00f3n para la Inmunizaci\u00f3n) es una  167 la Coalici\u00f3n de Acci\u00f3n para la Inmunizaci\u00f3n) es una 
            168 organizaci\u00f3n con sede en EE. UU. dedicada a proporcionar recursos  168 organizaci\u00f3n con sede en EE. UU. dedicada a proporcionar recursos 
            169 integrales de inmunizaci\u00f3n a profesionales de la salud y al  169 integrales de inmunizaci\u00f3n a profesionales de la salud y al 
            170 p\u00fablico en general. Las Hojas de Informaci\u00f3n sobre Vacunas  170 p\u00fablico en general. Las Hojas de Informaci\u00f3n sobre Vacunas 
            171 (VIS) se han traducido a varios idiomas, pero no todas contienen todas  171 (VIS) se han traducido a varios idiomas, pero no todas contienen todas 
            172 las VIS. Se ofrecen en formato PDF: 25 est\u00e1n en espa\u00f1ol,  172 las VIS. Se ofrecen en formato PDF: 25 est\u00e1n en espa\u00f1ol, 
            173 franc\u00e9s e ingl\u00e9s, pero solo 21 en alem\u00e1n. Solo se  173 franc\u00e9s e ingl\u00e9s, pero solo 21 en alem\u00e1n. Solo se 
            174 utilizaron PDF que coincid\u00edan en todos los  174 utilizaron PDF que coincid\u00edan en todos los 
            175 idiomas.\r\n\r\nMigraci\u00f3n y Salud - Ministerio de Salud de  175 idiomas.\r\n\r\nMigraci\u00f3n y Salud - Ministerio de Salud de 
            176 Alemania (BFG): Este portal ofrece informaci\u00f3n sanitaria  176 Alemania (BFG): Este portal ofrece informaci\u00f3n sanitaria 
            177 multiling\u00fce adaptada a migrantes y refugiados. \u00abSalud para  177 multiling\u00fce adaptada a migrantes y refugiados. \u00abSalud para 
            178 todos\u00bb es un archivo PDF que ofrece una gu\u00eda del sistema  178 todos\u00bb es un archivo PDF que ofrece una gu\u00eda del sistema 
            179 sanitario alem\u00e1n, disponible en espa\u00f1ol, ingl\u00e9s y  179 sanitario alem\u00e1n, disponible en espa\u00f1ol, ingl\u00e9s y 
            180 alem\u00e1n. Procesamiento: Dos temas, de menos de 100 palabras, se  180 alem\u00e1n. Procesamiento: Dos temas, de menos de 100 palabras, se 
            181 fusionaron con el siguiente para garantizar la conservaci\u00f3n del  181 fusionaron con el siguiente para garantizar la conservaci\u00f3n del 
            182 contexto.\r\n\r\nOrphadata (INSERM): una base de conocimiento completa  182 contexto.\r\n\r\nOrphadata (INSERM): una base de conocimiento completa 
            183 sobre enfermedades raras y medicamentos hu\u00e9rfanos, en formatos  183 sobre enfermedades raras y medicamentos hu\u00e9rfanos, en formatos 
            184 reutilizables y de alta calidad, disponible en 12 idiomas oficiales de  184 reutilizables y de alta calidad, disponible en 12 idiomas oficiales de 
            185 la UE. Recopilamos definiciones, signos y s\u00edntomas, y fenotipos  185 la UE. Recopilamos definiciones, signos y s\u00edntomas, y fenotipos 
            186 de 4389 enfermedades raras en ingl\u00e9s, alem\u00e1n, espa\u00f1ol y  186 de 4389 enfermedades raras en ingl\u00e9s, alem\u00e1n, espa\u00f1ol y 
            187 franc\u00e9s. Procesamiento: Dado que cada definici\u00f3n tiene  187 franc\u00e9s. Procesamiento: Dado que cada definici\u00f3n tiene 
            188 aproximadamente el mismo tama\u00f1o y formato, simplemente agrupamos  188 aproximadamente el mismo tama\u00f1o y formato, simplemente agrupamos 
            189 5 definiciones para ampliar el texto por tema.\r\n\r\nPubMed  189 5 definiciones para ampliar el texto por tema.\r\n\r\nPubMed 
            190 (Biblioteca Nacional de Medicina): descargamos res\u00famenes  190 (Biblioteca Nacional de Medicina): descargamos res\u00famenes 
            191 disponibles en ingl\u00e9s, espa\u00f1ol, franc\u00e9s y  191 disponibles en ingl\u00e9s, espa\u00f1ol, franc\u00e9s y 
            192 alem\u00e1n.\r\n\r\nWikipedia: un proyecto de enciclopedia  192 alem\u00e1n.\r\n\r\nWikipedia: un proyecto de enciclopedia 
            193 multiling\u00fce colaborativo, gratuito y basado en la web.  193 multiling\u00fce colaborativo, gratuito y basado en la web. 
            194 Seleccionamos contenido (bio)m\u00e9dico disponible en ingl\u00e9s,  194 Seleccionamos contenido (bio)m\u00e9dico disponible en ingl\u00e9s, 
            195 alem\u00e1n, espa\u00f1ol y franc\u00e9s. Para garantizar que los  195 alem\u00e1n, espa\u00f1ol y franc\u00e9s. Para garantizar que los 
            196 textos no se generaran autom\u00e1ticamente, solo utilizamos  196 textos no se generaran autom\u00e1ticamente, solo utilizamos 
            197 art\u00edculos anteriores al lanzamiento de ChatGPT, es decir,  197 art\u00edculos anteriores al lanzamiento de ChatGPT, es decir, 
            198 anteriores al 30 de noviembre de 2022. Procesamiento: fue necesaria  198 anteriores al 30 de noviembre de 2022. Procesamiento: fue necesaria 
            199 una limpieza de datos; tambi\u00e9n eliminamos todos los temas con  199 una limpieza de datos; tambi\u00e9n eliminamos todos los temas con 
            200 menos de 5 palabras o dividimos aquellos con m\u00e1s de 9 oraciones  200 menos de 5 palabras o dividimos aquellos con m\u00e1s de 9 oraciones 
            201 en partes de igual longitud. De estos archivos divididos, nos  201 en partes de igual longitud. De estos archivos divididos, nos 
            202 aseguramos de que contengan un m\u00ednimo de 100 palabras y solo  202 aseguramos de que contengan un m\u00ednimo de 100 palabras y solo 
            203 utilizamos los contenidos o temas disponibles en los tres  203 utilizamos los contenidos o temas disponibles en los tres 
            204 idiomas.\r\n\r\n[Descripci\u00f3n de los m\u00e9todos utilizados para  204 idiomas.\r\n\r\n[Descripci\u00f3n de los m\u00e9todos utilizados para 
            205 la recopilaci\u00f3n/generaci\u00f3n de datos] Las estad\u00edsticas y  205 la recopilaci\u00f3n/generaci\u00f3n de datos] Las estad\u00edsticas y 
            206 los m\u00e9todos del corpus se explican en el siguiente art\u00edculo:  206 los m\u00e9todos del corpus se explican en el siguiente art\u00edculo: 
            207 Patrick Styll, Leonardo Campillos-Llanos, Jorge  207 Patrick Styll, Leonardo Campillos-Llanos, Jorge 
            208 Fern\u00e1ndez-Garc\u00eda, Isabel Segura-Bedmar (2025)  208 Fern\u00e1ndez-Garc\u00eda, Isabel Segura-Bedmar (2025) 
            209 \u00abMedAID-ML: Un conjunto de datos multiling\u00fce de textos  209 \u00abMedAID-ML: Un conjunto de datos multiling\u00fce de textos 
            210 biom\u00e9dicos para la detecci\u00f3n de contenido generado por  210 biom\u00e9dicos para la detecci\u00f3n de contenido generado por 
            211 IA\u00bb. En revisi\u00f3n.\r\n\r\n[M\u00e9todos de procesamiento de  211 IA\u00bb. En revisi\u00f3n.\r\n\r\n[M\u00e9todos de procesamiento de 
            212 los datos] - Web-scraping de datos de contenido HTML y archivos PDF  212 los datos] - Web-scraping de datos de contenido HTML y archivos PDF 
            213 disponibles en los sitios web de contenidos de salud. -  213 disponibles en los sitios web de contenidos de salud. - 
            214 Postprocesamiento y limpieza de datos (por ejemplo, eliminaci\u00f3n  214 Postprocesamiento y limpieza de datos (por ejemplo, eliminaci\u00f3n 
            215 de espacios en blanco redundantes o saltos de l\u00ednea) y  215 de espacios en blanco redundantes o saltos de l\u00ednea) y 
            216 homogeneizaci\u00f3n de la longitud del texto. - Generaci\u00f3n de  216 homogeneizaci\u00f3n de la longitud del texto. - Generaci\u00f3n de 
            217 contenidos correspondientes mediante IA generativa utilizando tres  217 contenidos correspondientes mediante IA generativa utilizando tres 
            218 grandes modelos de lenguaje: GPT-4o, Mistral-7B y Llama3-1. - Formateo  218 grandes modelos de lenguaje: GPT-4o, Mistral-7B y Llama3-1. - Formateo 
            219 de contenidos en formato JSON.\r\n\r\n[Archivos] 1) Archivos JSON: Se  219 de contenidos en formato JSON.\r\n\r\n[Archivos] 1) Archivos JSON: Se 
            220 dividen en TRAIN y TEST. Cada archivo contiene una lista de hashes  220 dividen en TRAIN y TEST. Cada archivo contiene una lista de hashes 
            221 para cada texto, y cada hash contiene los siguientes campos: \u2022  221 para cada texto, y cada hash contiene los siguientes campos: \u2022 
            222 text: el contenido textual. \u2022 data_source: el repositorio fuente  222 text: el contenido textual. \u2022 data_source: el repositorio fuente 
            223 del texto. \u2022 filename: el nombre del archivo original del que se  223 del texto. \u2022 filename: el nombre del archivo original del que se 
            224 obtuvieron los datos. \u2022 source: etiqueta que indica si se trata  224 obtuvieron los datos. \u2022 source: etiqueta que indica si se trata 
            225 de un texto escrito por humanos (HUMAN) o del LLM utilizado para  225 de un texto escrito por humanos (HUMAN) o del LLM utilizado para 
            226 generarlo (\"gpt4o\", \"mistral\" o \"llama\"). \u2022 \"language\":  226 generarlo (\"gpt4o\", \"mistral\" o \"llama\"). \u2022 \"language\": 
            227 el c\u00f3digo de idioma del texto: alem\u00e1n (\"de\"), ingl\u00e9s  227 el c\u00f3digo de idioma del texto: alem\u00e1n (\"de\"), ingl\u00e9s 
            228 (\"en\"), espa\u00f1ol (\"es\") o franc\u00e9s (\"fr\"). \u2022  228 (\"en\"), espa\u00f1ol (\"es\") o franc\u00e9s (\"fr\"). \u2022 
            229 \"target\": una etiqueta binaria para indicar si el texto fue escrito  229 \"target\": una etiqueta binaria para indicar si el texto fue escrito 
            230 por humanos (\"0\") o por IA (\"1\"). \u2022 \"ratio\": la  230 por humanos (\"0\") o por IA (\"1\"). \u2022 \"ratio\": la 
            231 proporci\u00f3n del texto creado con IA: \"0,5\" para textos generados  231 proporci\u00f3n del texto creado con IA: \"0,5\" para textos generados 
            232 por IA y \"null\" para textos humanos.\r\n\r\nEl corpus consta de  232 por IA y \"null\" para textos humanos.\r\n\r\nEl corpus consta de 
            233 13.292 textos comparables y paralelos en cuatro idiomas: alem\u00e1n,  233 13.292 textos comparables y paralelos en cuatro idiomas: alem\u00e1n, 
            234 ingl\u00e9s, espa\u00f1ol y franc\u00e9s. El total de tokens es de  234 ingl\u00e9s, espa\u00f1ol y franc\u00e9s. El total de tokens es de 
            235 3.795.449. Este recurso est\u00e1 destinado al entrenamiento y la  235 3.795.449. Este recurso est\u00e1 destinado al entrenamiento y la 
            236 evaluaci\u00f3n de modelos para la detecci\u00f3n de textos  236 evaluaci\u00f3n de modelos para la detecci\u00f3n de textos 
            237 m\u00e9dicos creados mediante inteligencia artificial generativa." 237 m\u00e9dicos creados mediante inteligencia artificial generativa."
            238   }, 238   },
            239   "groups": [], 239   "groups": [],
            240   "id": "ade96985-70e0-41d8-b69c-003013a24503", 240   "id": "ade96985-70e0-41d8-b69c-003013a24503",
            241   "identifier": "http://hdl.handle.net/10261/389309", 241   "identifier": "http://hdl.handle.net/10261/389309",
            242   "instituto": [ 242   "instituto": [
            243     "Instituto de Lengua, Literatura y Antropolog\u00eda (ILLA), CSIC" 243     "Instituto de Lengua, Literatura y Antropolog\u00eda (ILLA), CSIC"
            244   ], 244   ],
            245   "international_spatial_translated": { 245   "international_spatial_translated": {
            246     "en": "Europe", 246     "en": "Europe",
            247     "es": "Europa" 247     "es": "Europa"
            248   }, 248   },
            249   "isopen": false, 249   "isopen": false,
            250   "issued_date": "2025-05-14T00:00:00", 250   "issued_date": "2025-05-14T00:00:00",
            251   "language": [ 251   "language": [
            252     "es", 252     "es",
            253     "en", 253     "en",
            254     "fr" 254     "fr"
            255   ], 255   ],
            256   "license_id": "https://digital.csic.es/handle/10261/389309", 256   "license_id": "https://digital.csic.es/handle/10261/389309",
            257   "license_title": "https://digital.csic.es/handle/10261/389309", 257   "license_title": "https://digital.csic.es/handle/10261/389309",
            258   "maintainer": null, 258   "maintainer": null,
            259   "maintainer_email": null, 259   "maintainer_email": null,
            260   "metadata_created": "2025-09-09T05:15:08.878441", 260   "metadata_created": "2025-09-09T05:15:08.878441",
            n 261   "metadata_modified": "2025-09-09T05:35:49.834787", n 261   "metadata_modified": "2025-09-09T05:36:05.594182",
            262   "modified_date": "2025-09-09T07:15:08", 262   "modified_date": "2025-09-09T07:15:08",
            263   "multilingual_tags": { 263   "multilingual_tags": {
            264     "en": [ 264     "en": [
            265       "AI-generated Text", 265       "AI-generated Text",
            266       "Generative AI", 266       "Generative AI",
            267       "Biomedical natural language processing", 267       "Biomedical natural language processing",
            268       "Biomedical corpus" 268       "Biomedical corpus"
            269     ], 269     ],
            270     "es": [ 270     "es": [
            271       "Textos generacos con IA", 271       "Textos generacos con IA",
            272       "IA generativa", 272       "IA generativa",
            273       "Procesamiento del Lenguaje Natural Biom\u00e9dico", 273       "Procesamiento del Lenguaje Natural Biom\u00e9dico",
            274       "Corpus biom\u00e9dico" 274       "Corpus biom\u00e9dico"
            275     ] 275     ]
            276   }, 276   },
            277   "name":  277   "name": 
            278 telligence-text-detection-in-multilingual-settings-medaid-ml-dataset", 278 telligence-text-detection-in-multilingual-settings-medaid-ml-dataset",
            279   "notes": null, 279   "notes": null,
            280   "num_resources": 3, 280   "num_resources": 3,
            281   "num_tags": 0, 281   "num_tags": 0,
            282   "organization": { 282   "organization": {
            283     "approval_status": "approved", 283     "approval_status": "approved",
            284     "created": "2025-04-15T15:18:43.186369", 284     "created": "2025-04-15T15:18:43.186369",
            285     "description": "Agencia Estatal Consejo Superior de  285     "description": "Agencia Estatal Consejo Superior de 
            286 Investigaciones Cient\u00edficas", 286 Investigaciones Cient\u00edficas",
            287     "id": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", 287     "id": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e",
            288     "image_url": "2025-04-15-131843.179987csic.png", 288     "image_url": "2025-04-15-131843.179987csic.png",
            289     "is_organization": true, 289     "is_organization": true,
            290     "name":  290     "name": 
            291 agencia-estatal-consejo-superior-de-investigaciones-cientificas-csic", 291 agencia-estatal-consejo-superior-de-investigaciones-cientificas-csic",
            292     "state": "active", 292     "state": "active",
            293     "title": "CSIC", 293     "title": "CSIC",
            294     "type": "organization" 294     "type": "organization"
            295   }, 295   },
            296   "owner_org": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e", 296   "owner_org": "e0fb512f-fa1e-43d9-ba1e-22b1f834702e",
            297   "private": false, 297   "private": false,
            298   "proyecto": {}, 298   "proyecto": {},
            299   "publisher": "b627d71d-2315-4e75-afc9-897da84459f0", 299   "publisher": "b627d71d-2315-4e75-afc9-897da84459f0",
            300   "reference": [ 300   "reference": [
            301     "https://doi.org/10.1007/978-3-032-04354-2_5", 301     "https://doi.org/10.1007/978-3-032-04354-2_5",
            302     "https://github.com/Padraig20/MedAID-ML", 302     "https://github.com/Padraig20/MedAID-ML",
            303     "https://doi.org/10.20350/digitalCSIC/17276" 303     "https://doi.org/10.20350/digitalCSIC/17276"
            304   ], 304   ],
            305   "relationships_as_object": [], 305   "relationships_as_object": [],
            306   "relationships_as_subject": [], 306   "relationships_as_subject": [],
            307   "resources": [ 307   "resources": [
            308     { 308     {
            309       "cache_last_updated": null, 309       "cache_last_updated": null,
            310       "cache_url": null, 310       "cache_url": null,
            n n 311       "created": "2025-09-09T05:15:08.882167",
            312       "datastore_active": false,
            313       "description": null,
            314       "format": "txt",
            315       "hash": "",
            316       "id": "240d64f1-a51b-4893-aae1-944abe275335",
            317       "last_modified": null,
            318       "metadata_modified": "2025-09-09T05:36:05.597037",
            319       "mimetype": null,
            320       "mimetype_inner": null,
            321       "name": "README.txt",
            322       "name_translated": {
            323         "en": "README.txt",
            324         "es": "README.txt"
            325       },
            326       "package_id": "ade96985-70e0-41d8-b69c-003013a24503",
            327       "position": 0,
            328       "resource_identifier": "",
            329       "resource_relation": [],
            330       "resource_type": null,
            331       "size": null,
            332       "state": "active",
            333       "url": 
            334 "https://digital.csic.es/bitstream/10261/389309/6/README.txt",
            335       "url_type": null
            336     },
            337     {
            338       "cache_last_updated": null,
            339       "cache_url": null,
            311       "created": "2025-09-09T05:15:08.882163", 340       "created": "2025-09-09T05:15:08.882163",
            312       "datastore_active": false, 341       "datastore_active": false,
            313       "description": null, 342       "description": null,
            314       "format": "json", 343       "format": "json",
            315       "hash": "", 344       "hash": "",
            316       "id": "b79c78c7-4748-4f8f-9661-95d5cfe99125", 345       "id": "b79c78c7-4748-4f8f-9661-95d5cfe99125",
            317       "last_modified": null, 346       "last_modified": null,
            n 318       "metadata_modified": "2025-09-09T05:15:08.876501", n 347       "metadata_modified": "2025-09-09T05:36:05.597157",
            319       "mimetype": null, 348       "mimetype": null,
            320       "mimetype_inner": null, 349       "mimetype_inner": null,
            321       "name": "dataset_test.json", 350       "name": "dataset_test.json",
            322       "name_translated": { 351       "name_translated": {
            323         "en": "dataset_test.json", 352         "en": "dataset_test.json",
            324         "es": "dataset_test.json" 353         "es": "dataset_test.json"
            325       }, 354       },
            326       "package_id": "ade96985-70e0-41d8-b69c-003013a24503", 355       "package_id": "ade96985-70e0-41d8-b69c-003013a24503",
            n 327       "position": 0, n 356       "position": 1,
            328       "resource_identifier": "", 357       "resource_identifier": "",
            329       "resource_relation": [], 358       "resource_relation": [],
            330       "resource_type": null, 359       "resource_type": null,
            331       "size": null, 360       "size": null,
            332       "state": "active", 361       "state": "active",
            333       "url":  362       "url": 
            334 "https://digital.csic.es/bitstream/10261/389309/1/dataset_test.json", 363 "https://digital.csic.es/bitstream/10261/389309/1/dataset_test.json",
            335       "url_type": null 364       "url_type": null
            336     }, 365     },
            337     { 366     {
            338       "cache_last_updated": null, 367       "cache_last_updated": null,
            339       "cache_url": null, 368       "cache_url": null,
            340       "created": "2025-09-09T05:15:08.882165", 369       "created": "2025-09-09T05:15:08.882165",
            341       "datastore_active": false, 370       "datastore_active": false,
            342       "description": null, 371       "description": null,
            343       "format": "json", 372       "format": "json",
            344       "hash": "", 373       "hash": "",
            345       "id": "dee0c93c-9756-4efa-ab3b-38b21f604650", 374       "id": "dee0c93c-9756-4efa-ab3b-38b21f604650",
            346       "last_modified": null, 375       "last_modified": null,
            n 347       "metadata_modified": "2025-09-09T05:15:08.876616", n 376       "metadata_modified": "2025-09-09T05:36:05.597234",
            348       "mimetype": null, 377       "mimetype": null,
            349       "mimetype_inner": null, 378       "mimetype_inner": null,
            350       "name": "dataset_train.json", 379       "name": "dataset_train.json",
            351       "name_translated": { 380       "name_translated": {
            352         "en": "dataset_train.json", 381         "en": "dataset_train.json",
            353         "es": "dataset_train.json" 382         "es": "dataset_train.json"
            n 354       }, n 
            355       "package_id": "ade96985-70e0-41d8-b69c-003013a24503", 
            356       "position": 1, 
            357       "resource_identifier": "", 
            358       "resource_relation": [], 
            359       "resource_type": null, 
            360       "size": null, 
            361       "state": "active", 
            362       "url":  
            363 "https://digital.csic.es/bitstream/10261/389309/2/dataset_train.json", 
            364       "url_type": null 
            365     }, 
            366     { 
            367       "cache_last_updated": null, 
            368       "cache_url": null, 
            369       "created": "2025-09-09T05:15:08.882167", 
            370       "datastore_active": false, 
            371       "description": null, 
            372       "format": "txt", 
            373       "hash": "", 
            374       "id": "240d64f1-a51b-4893-aae1-944abe275335", 
            375       "last_modified": null, 
            376       "metadata_modified": "2025-09-09T05:15:08.876714", 
            377       "mimetype": null, 
            378       "mimetype_inner": null, 
            379       "name": "README.txt", 
            380       "name_translated": { 
            381         "en": "README.txt", 
            382         "es": "README.txt" 
            383       }, 383       },
            384       "package_id": "ade96985-70e0-41d8-b69c-003013a24503", 384       "package_id": "ade96985-70e0-41d8-b69c-003013a24503",
            385       "position": 2, 385       "position": 2,
            386       "resource_identifier": "", 386       "resource_identifier": "",
            387       "resource_relation": [], 387       "resource_relation": [],
            388       "resource_type": null, 388       "resource_type": null,
            389       "size": null, 389       "size": null,
            390       "state": "active", 390       "state": "active",
            391       "url":  391       "url": 
            t 392 "https://digital.csic.es/bitstream/10261/389309/6/README.txt", t 392 "https://digital.csic.es/bitstream/10261/389309/2/dataset_train.json",
            393       "url_type": null 393       "url_type": null
            394     } 394     }
            395   ], 395   ],
            396   "spatial": [], 396   "spatial": [],
            397   "state": "active", 397   "state": "active",
            398   "tags": [], 398   "tags": [],
            399   "theme": [ 399   "theme": [
            400      400     
            401 "http://datos.gob.es/kos/sector-publico/sector/ciencia-tecnologia", 401 "http://datos.gob.es/kos/sector-publico/sector/ciencia-tecnologia",
            402     "http://datos.gob.es/kos/sector-publico/sector/salud" 402     "http://datos.gob.es/kos/sector-publico/sector/salud"
            403   ], 403   ],
            404   "title": "Medical Artificial Intelligence text Detection in  404   "title": "Medical Artificial Intelligence text Detection in 
            405 Multilingual settings (MedAID-ML)", 405 Multilingual settings (MedAID-ML)",
            406   "title_translated": { 406   "title_translated": {
            407     "en": "Medical Artificial Intelligence text Detection in  407     "en": "Medical Artificial Intelligence text Detection in 
            408 Multilingual settings (MedAID-ML)", 408 Multilingual settings (MedAID-ML)",
            409     "es": "Medical Artificial Intelligence text Detection in  409     "es": "Medical Artificial Intelligence text Detection in 
            410 Multilingual settings (MedAID-ML)" 410 Multilingual settings (MedAID-ML)"
            411   }, 411   },
            412   "type": "dataset", 412   "type": "dataset",
            413   "url": null, 413   "url": null,
            414   "version": null 414   "version": null
            415 } 415 }