{"id":"https://openalex.org/W7129031490","doi":"https://doi.org/10.48550/arxiv.2602.12414","title":"propella-1: Multi-Property Document Annotation for LLM Data Curation at Scale","display_name":"propella-1: Multi-Property Document Annotation for LLM Data Curation at Scale","publication_year":2026,"publication_date":"2026-02-12","ids":{"openalex":"https://openalex.org/W7129031490","doi":"https://doi.org/10.48550/arxiv.2602.12414"},"language":null,"primary_location":{"id":"pmh:doi:10.48550/arxiv.2602.12414","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":null,"any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5126076314","display_name":"Maximilian Idahl","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Idahl, Maximilian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5123021806","display_name":"Benedikt Droste","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Droste, Benedikt","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5126128721","display_name":"Bj\u00f6rn Pl\u00fcster","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Pl\u00fcster, Bj\u00f6rn","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5126126100","display_name":"Jan Philipp Harries","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Harries, Jan Philipp","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":4,"corresponding_author_ids":["https://openalex.org/A5126076314"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T14330","display_name":"Library Science and Information Systems","score":0.2012999951839447,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T14330","display_name":"Library Science and Information Systems","score":0.2012999951839447,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11710","display_name":"Biomedical Text Mining and Ontologies","score":0.19110000133514404,"subfield":{"id":"https://openalex.org/subfields/1312","display_name":"Molecular Biology"},"field":{"id":"https://openalex.org/fields/13","display_name":"Biochemistry, Genetics and Molecular Biology"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}},{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.08309999853372574,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-curation","display_name":"Data curation","score":0.7828999757766724},{"id":"https://openalex.org/keywords/json","display_name":"JSON","score":0.722100019454956},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6589999794960022},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.4986000061035156},{"id":"https://openalex.org/keywords/scale","display_name":"Scale (ratio)","score":0.4277999997138977},{"id":"https://openalex.org/keywords/data-quality","display_name":"Data quality","score":0.3625999987125397},{"id":"https://openalex.org/keywords/metadata","display_name":"Metadata","score":0.3546999990940094},{"id":"https://openalex.org/keywords/interoperability","display_name":"Interoperability","score":0.33149999380111694}],"concepts":[{"id":"https://openalex.org/C91632574","wikidata":"https://www.wikidata.org/wiki/Q15088675","display_name":"Data curation","level":2,"score":0.7828999757766724},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.75},{"id":"https://openalex.org/C2780416260","wikidata":"https://www.wikidata.org/wiki/Q2063","display_name":"JSON","level":2,"score":0.722100019454956},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6589999794960022},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6171000003814697},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.4986000061035156},{"id":"https://openalex.org/C2778755073","wikidata":"https://www.wikidata.org/wiki/Q10858537","display_name":"Scale (ratio)","level":2,"score":0.4277999997138977},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4138999879360199},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.3625999987125397},{"id":"https://openalex.org/C93518851","wikidata":"https://www.wikidata.org/wiki/Q180160","display_name":"Metadata","level":2,"score":0.3546999990940094},{"id":"https://openalex.org/C20136886","wikidata":"https://www.wikidata.org/wiki/Q749647","display_name":"Interoperability","level":2,"score":0.33149999380111694},{"id":"https://openalex.org/C62230096","wikidata":"https://www.wikidata.org/wiki/Q275969","display_name":"Crowdsourcing","level":2,"score":0.3253999948501587},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.3253999948501587},{"id":"https://openalex.org/C40231798","wikidata":"https://www.wikidata.org/wiki/Q1333743","display_name":"Composition (language)","level":2,"score":0.2969000041484833},{"id":"https://openalex.org/C9770341","wikidata":"https://www.wikidata.org/wiki/Q1938983","display_name":"Geospatial analysis","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C2778571376","wikidata":"https://www.wikidata.org/wiki/Q1355821","display_name":"Frontier","level":2,"score":0.2822999954223633},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2759999930858612},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.2676999866962433},{"id":"https://openalex.org/C19875794","wikidata":"https://www.wikidata.org/wiki/Q1207340","display_name":"Item response theory","level":3,"score":0.2671000063419342},{"id":"https://openalex.org/C106347477","wikidata":"https://www.wikidata.org/wiki/Q5384228","display_name":"Equating","level":3,"score":0.26460000872612},{"id":"https://openalex.org/C8797682","wikidata":"https://www.wikidata.org/wiki/Q2115","display_name":"XML","level":2,"score":0.2621000111103058},{"id":"https://openalex.org/C67186912","wikidata":"https://www.wikidata.org/wiki/Q367664","display_name":"Data modeling","level":2,"score":0.2531000077724457}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:doi:10.48550/arxiv.2602.12414","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},{"id":"doi:10.48550/arxiv.2602.12414","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2602.12414","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:doi:10.48550/arxiv.2602.12414","is_oa":true,"landing_page_url":null,"pdf_url":null,"source":{"id":"https://openalex.org/S4406922384","display_name":"Open MIND","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"Article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Since":[0],"FineWeb-Edu,":[1],"data":[2,124],"curation":[3],"for":[4],"LLM":[5,92],"pretraining":[6,121,144],"has":[7],"predominantly":[8],"relied":[9],"on":[10],"single":[11,20],"scalar":[12],"quality":[13,24,60],"scores":[14],"produced":[15],"by":[16],"small":[17,39],"classifiers.":[18],"A":[19],"score":[21],"conflates":[22],"multiple":[23],"dimensions,":[25],"prevents":[26],"flexible":[27],"filtering,":[28],"and":[29,61,64,67,69,77,130,153,164],"offers":[30],"no":[31],"interpretability.":[32],"We":[33,108],"introduce":[34],"propella-1,":[35],"a":[36,84,89,94,111,137],"family":[37],"of":[38,113,141],"multilingual":[40],"LLMs":[41],"(0.6B,":[42],"1.7B,":[43],"4B":[44,98],"parameters)":[45],"that":[46,156],"annotate":[47],"text":[48],"documents":[49],"across":[50],"18":[51],"properties":[52],"organized":[53],"into":[54],"six":[55],"categories:":[56],"core":[57],"content,":[58],"classification,":[59],"value,":[62],"audience":[63],"purpose,":[65],"safety":[66],"compliance,":[68],"geographic":[70],"relevance.":[71],"The":[72],"models":[73],"support":[74],"57":[75],"languages":[76],"produce":[78],"structured":[79],"JSON":[80],"annotations":[81,118,165],"conforming":[82],"to":[83],"predefined":[85],"schema.":[86],"Evaluated":[87],"against":[88],"frontier":[90],"commercial":[91],"as":[93],"reference":[95],"annotator,":[96],"the":[97],"model":[99,162],"achieves":[100],"higher":[101],"agreement":[102],"than":[103],"much":[104],"larger":[105],"general-purpose":[106],"models.":[107],"release":[109],"propella-annotations,":[110],"dataset":[112],"over":[114],"three":[115],"billion":[116],"document":[117],"covering":[119],"major":[120],"corpora":[122],"including":[123],"from":[125],"FineWeb-2,":[126],"FinePDFs,":[127],"HPLT":[128],"3.0,":[129],"Nemotron-CC.":[131],"Using":[132],"these":[133],"annotations,":[134],"we":[135],"present":[136],"multi-dimensional":[138],"compositional":[139],"analysis":[140],"widely":[142],"used":[143],"datasets,":[145],"revealing":[146],"substantial":[147],"differences":[148],"in":[149],"quality,":[150],"reasoning":[151],"depth,":[152],"content":[154],"composition":[155],"single-score":[157],"approaches":[158],"cannot":[159],"capture.":[160],"All":[161],"weights":[163],"are":[166],"released":[167],"under":[168],"permissive,":[169],"commercial-use":[170],"licenses.":[171]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2026-02-17T00:00:00"}
