{"id":"https://openalex.org/W6968357975","doi":"https://doi.org/10.5281/zenodo.13739396","title":"ChatNoir Resiliparse","display_name":"ChatNoir Resiliparse","publication_year":2024,"publication_date":"2024-09-10","ids":{"openalex":"https://openalex.org/W6968357975","doi":"https://doi.org/10.5281/zenodo.13739396"},"language":"en","primary_location":{"id":"doi:10.5281/zenodo.13739396","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.13739396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"other","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.5281/zenodo.13739396","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Bevendorff, Janek","orcid":"https://orcid.org/0000-0002-3797-0559"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"education","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":true,"raw_author_name":"Bevendorff, Janek","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Stein, Benno","orcid":"https://orcid.org/0000-0001-9033-2217"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"education","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Stein, Benno","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Hagen, Matthias","orcid":"https://orcid.org/0000-0002-9733-2890"},"institutions":[{"id":"https://openalex.org/I76198965","display_name":"Friedrich Schiller University Jena","ror":"https://ror.org/05qpz1x62","country_code":"DE","type":"education","lineage":["https://openalex.org/I76198965"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Hagen, Matthias","raw_affiliation_strings":["Friedrich-Schiller-Universit\u00e4t Jena"],"affiliations":[{"raw_affiliation_string":"Friedrich-Schiller-Universit\u00e4t Jena","institution_ids":["https://openalex.org/I76198965"]}]},{"author_position":"last","author":{"id":null,"display_name":"Potthast, Martin","orcid":"https://orcid.org/0000-0003-2451-0665"},"institutions":[{"id":"https://openalex.org/I926574661","display_name":"Leipzig University","ror":"https://ror.org/03s7gtk40","country_code":"DE","type":"education","lineage":["https://openalex.org/I926574661"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Potthast, Martin","raw_affiliation_strings":["Leipzig University and ScaDS.AI"],"affiliations":[{"raw_affiliation_string":"Leipzig University and ScaDS.AI","institution_ids":["https://openalex.org/I926574661"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":["https://openalex.org/I51441396"],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":true,"primary_topic":null,"topics":[],"keywords":[{"id":"https://openalex.org/keywords/terabyte","display_name":"Terabyte","score":0.900600016117096},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.5540000200271606},{"id":"https://openalex.org/keywords/petabyte","display_name":"Petabyte","score":0.482699990272522},{"id":"https://openalex.org/keywords/web-page","display_name":"Web page","score":0.4138999879360199},{"id":"https://openalex.org/keywords/web-application","display_name":"Web application","score":0.3961000144481659},{"id":"https://openalex.org/keywords/interface","display_name":"Interface (matter)","score":0.36970001459121704},{"id":"https://openalex.org/keywords/web-search-engine","display_name":"Web search engine","score":0.3634999990463257},{"id":"https://openalex.org/keywords/web-modeling","display_name":"Web modeling","score":0.35249999165534973},{"id":"https://openalex.org/keywords/web-service","display_name":"Web service","score":0.33880001306533813}],"concepts":[{"id":"https://openalex.org/C199683683","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Terabyte","level":2,"score":0.900600016117096},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7753000259399414},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.5540000200271606},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.552299976348877},{"id":"https://openalex.org/C13600138","wikidata":"https://www.wikidata.org/wiki/Q8799","display_name":"Petabyte","level":3,"score":0.482699990272522},{"id":"https://openalex.org/C21959979","wikidata":"https://www.wikidata.org/wiki/Q36774","display_name":"Web page","level":2,"score":0.4138999879360199},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.40540000796318054},{"id":"https://openalex.org/C118643609","wikidata":"https://www.wikidata.org/wiki/Q189210","display_name":"Web application","level":2,"score":0.3961000144481659},{"id":"https://openalex.org/C113843644","wikidata":"https://www.wikidata.org/wiki/Q901882","display_name":"Interface (matter)","level":4,"score":0.36970001459121704},{"id":"https://openalex.org/C521815418","wikidata":"https://www.wikidata.org/wiki/Q4182287","display_name":"Web search engine","level":4,"score":0.3634999990463257},{"id":"https://openalex.org/C130436687","wikidata":"https://www.wikidata.org/wiki/Q7978591","display_name":"Web modeling","level":3,"score":0.35249999165534973},{"id":"https://openalex.org/C35578498","wikidata":"https://www.wikidata.org/wiki/Q193424","display_name":"Web service","level":2,"score":0.33880001306533813},{"id":"https://openalex.org/C97854310","wikidata":"https://www.wikidata.org/wiki/Q19541","display_name":"Search engine","level":2,"score":0.3264000117778778},{"id":"https://openalex.org/C13743948","wikidata":"https://www.wikidata.org/wiki/Q45842","display_name":"Web crawler","level":2,"score":0.3208000063896179},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.32010000944137573},{"id":"https://openalex.org/C110875604","wikidata":"https://www.wikidata.org/wiki/Q75","display_name":"The Internet","level":2,"score":0.3174999952316284},{"id":"https://openalex.org/C162005631","wikidata":"https://www.wikidata.org/wiki/Q54837","display_name":"Data Web","level":3,"score":0.3142000138759613},{"id":"https://openalex.org/C89505385","wikidata":"https://www.wikidata.org/wiki/Q47146","display_name":"User interface","level":2,"score":0.3131999969482422},{"id":"https://openalex.org/C79373723","wikidata":"https://www.wikidata.org/wiki/Q386275","display_name":"Web development","level":3,"score":0.31060001254081726},{"id":"https://openalex.org/C99613125","wikidata":"https://www.wikidata.org/wiki/Q165194","display_name":"Application programming interface","level":2,"score":0.30160000920295715},{"id":"https://openalex.org/C127613066","wikidata":"https://www.wikidata.org/wiki/Q557770","display_name":"Web API","level":4,"score":0.2761000096797943},{"id":"https://openalex.org/C79158427","wikidata":"https://www.wikidata.org/wiki/Q485396","display_name":"Analytics","level":2,"score":0.26510000228881836},{"id":"https://openalex.org/C173979980","wikidata":"https://www.wikidata.org/wiki/Q114106","display_name":"Metasearch engine","level":4,"score":0.2623000144958496},{"id":"https://openalex.org/C173576120","wikidata":"https://www.wikidata.org/wiki/Q2641220","display_name":"Static web page","level":4,"score":0.259799987077713},{"id":"https://openalex.org/C138827492","wikidata":"https://www.wikidata.org/wiki/Q6661985","display_name":"Data processing","level":2,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5281/zenodo.13739396","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.13739396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.5281/zenodo.13739396","is_oa":true,"landing_page_url":"https://doi.org/10.5281/zenodo.13739396","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"This":[0],"package":[1],"contains":[2],"ChatNoir":[3,23,25,57,333],"Resiliparse,":[4],"a":[5,32,75,95,111,122,152,177,263,342],"collection":[6],"of":[7,82,147,179,183,197,244,252,280,336],"robust":[8],"and":[9,15,42,101,119,137,154,186,254,288,297,319,362,365,368,389,392,395,424,447,450,468,471,474],"fast":[10],"processing":[11,145,218,267],"tools":[12,233,338],"for":[13,37,97,104,124,173,190,215,269,294,316,341,421],"parsing":[14],"analyzing":[16,255],"web":[17,50,130,140,148,198,219],"archive":[18],"data.":[19],"Paper":[20,132],"Abstract":[21,133],"Elastic":[22,24,56,87,290,312],"is":[26,79,91],"an":[27],"Elasticsearch-based":[28],"search":[29,35,65,70,136],"engine":[30],"offering":[31],"freely":[33],"accessible":[34],"interface":[36],"the":[38,43,105,162,170,212,222,249,295,298,317,320,422,425],"two":[39],"ClueWeb":[40,296,318,423],"corpora":[41],"Common":[44,299,321,426],"Crawl,":[45,300,322],"together":[46],"about":[47],"3":[48],"billion":[49],"pages.":[51],"Running":[52],"across":[53],"130":[54],"nodes,":[55],"features":[58],"subsecond":[59],"response":[60],"times":[61],"comparable":[62],"to":[63,84,92,115,120,129,194,260,284,308,347],"commercial":[64,69],"engines.":[66],"Unlike":[67],"most":[68],"engines,":[71],"it":[72,188],"also":[73,438],"offers":[74],"powerful":[76],"API":[77],"that":[78],"available":[80],"free":[81],"charge":[83],"IR":[85,99,381],"researchers.":[86],"ChatNoir\u2019s":[88],"main":[89],"purpose":[90],"serve":[93],"as":[94,210],"baseline":[96],"reproducible":[98],"experiments":[100],"user":[102],"studies":[103],"coming":[106],"years,":[107],"empowering":[108],"research":[109],"at":[110,221],"scale":[112],"not":[113],"attainable":[114],"many":[116,206],"labs":[117],"beforehand,":[118],"provide":[121],"platform":[123],"experimenting":[125],"with":[126],"new":[127,264],"approaches":[128],"search.":[131],"FastWARC":[134],"Web":[135,164,304,326,496],"other":[138],"large-scale":[139],"data":[141,199],"analytics":[142],"rely":[143],"on":[144,286,310,380,458],"archives":[146,220],"pages":[149],"stored":[150],"in":[151,160,205,231,238,272,374,408],"standardized":[153],"efficient":[155],"format.":[156],"Since":[157],"its":[158,256,337],"introduction":[159],"2008,":[161],"IIPC's":[163],"ARCive":[165],"(WARC)":[166],"format":[167,172],"has":[168],"become":[169],"standard":[171,214],"this":[174,440],"purpose.":[175],"As":[176],"list":[178],"individually":[180],"compressed":[181],"records":[182],"HTTP":[184],"requests":[185],"responses,":[187],"allows":[189],"constant-time":[191],"random":[192],"access":[193],"all":[195],"kinds":[196],"via":[200],"off-the-shelf":[201],"open":[202],"source":[203],"parsers":[204],"programming":[207],"languages,":[208],"such":[209],"WARCIO,":[211],"de-facto":[213],"Python.":[216],"When":[217],"terabyte":[223],"or":[224,241,334],"petabyte":[225],"scale,":[226],"however,":[227],"even":[228,242],"small":[229],"inefficiencies":[230],"these":[232],"add":[234],"up":[235],"quickly,":[236],"resulting":[237],"hours,":[239],"days,":[240],"weeks":[243],"wasted":[245],"compute":[246],"time.":[247],"Reviewing":[248],"basic":[250],"components":[251],"WARCIO":[253],"bottlenecks,":[257],"we":[258],"proceed":[259],"build":[261],"FastWARC,":[262,435],"high-performance":[265],"WARC":[266],"library":[268],"Python,":[270],"written":[271],"C++/Cython,":[273],"which":[274],"yields":[275],"performance":[276],"improvements":[277],"by":[278],"factors":[279],"1.6-8x.":[281],"Links":[282],"Link":[283,307],"papers":[285,309],"Springer":[287],"ArXiv:":[289],"ChatNoir:":[291,313,418],"Search":[292,314,419,460,484],"Engine":[293,315,420],"FastWARC:":[301,323],"Optimizing":[302,324,494],"Large-Scale":[303,325,495],"Archive":[305,327,497],"Analytics":[306,328],"Webis:":[311],"Citation":[329],"If":[330,432],"you":[331,433,436],"use":[332,434],"any":[335],"(like":[339],"Resiliparse)":[340],"publication,":[343],"please":[344],"be":[345],"sure":[346],"cite":[348,439],"our":[349],"paper:":[350,441],"@InProceedings{bevendorff:2018,":[351],"address":[352],"=":[353,359,372,386,399,402,405,412,416,429,444,454,465,478,481,487,492,500],"{Berlin":[354],"Heidelberg":[355],"New":[356],"York},":[357],"author":[358,443],"{Janek":[360,445],"Bevendorff":[361,446],"Benno":[363,451],"Stein":[364],"Matthias":[366],"Hagen":[367],"Martin":[369,448],"Potthast},":[370],"booktitle":[371,453],"{Advances":[373],"Information":[375],"Retrieval.":[376],"40th":[377],"European":[378],"Conference":[379],"Research":[382],"(ECIR":[383],"2018)},":[384],"editor":[385,464],"{Leif":[387],"Azzopardi":[388],"Allan":[390],"Hanbury":[391],"Gabriella":[393],"Pasi":[394],"Benjamin":[396],"Piwowarski},":[397],"month":[398,477],"mar,":[400],"publisher":[401,480],"{Springer},":[403],"series":[404],"{Lecture":[406],"Notes":[407],"Computer":[409],"Science},":[410],"site":[411,486],"{Grenoble,":[413],"France},":[414],"title":[415,491],"{{Elastic":[417],"Crawl}},":[427],"year":[428,499],"2018":[430],"}":[431,502],"can":[437],"@InProceedings{bevendorff:2021,":[442],"Potthast":[449],"Stein},":[452],"{3rd":[455],"International":[456],"Symposium":[457],"Open":[459,483],"Technology":[461],"(OSSYM":[462],"2021)},":[463],"{Andreas":[466],"Wagner":[467],"Christian":[469],"Guetl":[470],"Michael":[472],"Granitzer":[473],"Stefan":[475],"Voigt},":[476],"oct,":[479],"{International":[482],"Symposium},":[485],"{CERN,":[488],"Geneva,":[489],"Switzerland},":[490],"{{FastWARC:":[493],"Analytics}},":[498],"2021":[501]},"counts_by_year":[],"updated_date":"2025-11-06T06:51:31.235846","created_date":"2025-10-10T00:00:00"}
