{"id":"https://openalex.org/W7152825326","doi":"https://doi.org/10.1145/3774904.3792597","title":"PaperAsk: A Benchmark for Reliability Evaluation of LLMs in Paper Search and Reading","display_name":"PaperAsk: A Benchmark for Reliability Evaluation of LLMs in Paper Search and Reading","publication_year":2026,"publication_date":"2026-04-09","ids":{"openalex":"https://openalex.org/W7152825326","doi":"https://doi.org/10.1145/3774904.3792597"},"language":null,"primary_location":{"id":"doi:10.1145/3774904.3792597","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3774904.3792597","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2026","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://doi.org/10.1145/3774904.3792597","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5004478523","display_name":"Yutao Wu","orcid":"https://orcid.org/0000-0001-8598-3437"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Yutao Wu","raw_affiliation_strings":["Deakin University, Melbourne, Australia"],"raw_orcid":"https://orcid.org/0000-0001-8598-3437","affiliations":[{"raw_affiliation_string":"Deakin University, Melbourne, Australia","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Xiao Liu","orcid":"https://orcid.org/0000-0001-8400-5754"},"institutions":[{"id":"https://openalex.org/I149704539","display_name":"Deakin University","ror":"https://ror.org/02czsnj07","country_code":"AU","type":"education","lineage":["https://openalex.org/I149704539"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Xiao Liu","raw_affiliation_strings":["Deakin Universtiy, Melbourne, Australia"],"raw_orcid":"https://orcid.org/0000-0001-8400-5754","affiliations":[{"raw_affiliation_string":"Deakin Universtiy, Melbourne, Australia","institution_ids":["https://openalex.org/I149704539"]}]},{"author_position":"middle","author":{"id":null,"display_name":"Yunhao Feng","orcid":"https://orcid.org/0009-0002-9227-9717"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yunhao Feng","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0002-9227-9717","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5133259507","display_name":"Jiale Ding","orcid":"https://orcid.org/0009-0006-9238-8840"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jiale Ding","raw_affiliation_strings":["Fudan University, Shanghai, China"],"raw_orcid":"https://orcid.org/0009-0006-9238-8840","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, China","institution_ids":["https://openalex.org/I24943067"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5078711649","display_name":"Xingjun Ma","orcid":"https://orcid.org/0000-0003-2099-4973"},"institutions":[{"id":"https://openalex.org/I24943067","display_name":"Fudan University","ror":"https://ror.org/013q1eq08","country_code":"CN","type":"education","lineage":["https://openalex.org/I24943067"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xingjun Ma","raw_affiliation_strings":["Fudan University, Shanghai, Australia"],"raw_orcid":"https://orcid.org/0000-0003-2099-4973","affiliations":[{"raw_affiliation_string":"Fudan University, Shanghai, Australia","institution_ids":["https://openalex.org/I24943067"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":5,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.57727911,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"2330","last_page":"2338"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.09120000153779984,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.09120000153779984,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13523","display_name":"Mathematics, Computing, and Information Processing","score":0.07699999958276749,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10102","display_name":"scientometrics and bibliometrics research","score":0.07580000162124634,"subfield":{"id":"https://openalex.org/subfields/1804","display_name":"Statistics, Probability and Uncertainty"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/reliability","display_name":"Reliability (semiconductor)","score":0.6970999836921692},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5879999995231628},{"id":"https://openalex.org/keywords/reading","display_name":"Reading (process)","score":0.49810001254081726},{"id":"https://openalex.org/keywords/key","display_name":"Key (lock)","score":0.32030001282691956},{"id":"https://openalex.org/keywords/grasp","display_name":"GRASP","score":0.2870999872684479}],"concepts":[{"id":"https://openalex.org/C43214815","wikidata":"https://www.wikidata.org/wiki/Q7310987","display_name":"Reliability (semiconductor)","level":3,"score":0.6970999836921692},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6172000169754028},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5879999995231628},{"id":"https://openalex.org/C554936623","wikidata":"https://www.wikidata.org/wiki/Q199657","display_name":"Reading (process)","level":2,"score":0.49810001254081726},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.42089998722076416},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.34779998660087585},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3379000127315521},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.3287000060081482},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.32030001282691956},{"id":"https://openalex.org/C171268870","wikidata":"https://www.wikidata.org/wiki/Q1486676","display_name":"GRASP","level":2,"score":0.2870999872684479},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.27970001101493835},{"id":"https://openalex.org/C168167062","wikidata":"https://www.wikidata.org/wiki/Q1117970","display_name":"Component (thermodynamics)","level":2,"score":0.26809999346733093},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2671999931335449},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.2628999948501587},{"id":"https://openalex.org/C197947376","wikidata":"https://www.wikidata.org/wiki/Q5155608","display_name":"Comparability","level":2,"score":0.25780001282691956}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3774904.3792597","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3774904.3792597","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2026","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3774904.3792597","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3774904.3792597","pdf_url":null,"source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the ACM Web Conference 2026","raw_type":"proceedings-article"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7776609659194946}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":false},"content_urls":null,"referenced_works_count":8,"referenced_works":["https://openalex.org/W2168190036","https://openalex.org/W2740900805","https://openalex.org/W4388886073","https://openalex.org/W4402264526","https://openalex.org/W4408615073","https://openalex.org/W4409150456","https://openalex.org/W4409505176","https://openalex.org/W4409657210"],"related_works":[],"abstract_inverted_index":{"Large":[0],"Language":[1],"Models":[2],"(LLMs)":[3],"increasingly":[4],"serve":[5],"as":[6],"research":[7,32],"assistants,":[8],"yet":[9],"their":[10],"reliability":[11,70,160,180],"in":[12,75,84],"scholarly":[13,184],"tasks":[14],"remains":[15],"under-evaluated.":[16],"In":[17],"this":[18],"work,":[19],"we":[20,67,157],"introduce":[21],"PaperAsk,":[22],"a":[23,172],"benchmark":[24,188],"that":[25],"systematically":[26],"evaluates":[27],"LLMs":[28,120,133],"across":[29],"four":[30],"key":[31],"tasks:":[33],"citation":[34,72],"retrieval,":[35],"content":[36,81],"extraction,":[37],"paper":[38,90],"discovery,":[39],"and":[40,47,88,116,174],"claim":[41],"verification.":[42],"We":[43],"evaluate":[44],"GPT-4o,":[45],"GPT-5,":[46],"Gemini-2.5-Flash":[48],"under":[49],"realistic":[50],"usage":[51],"conditions,":[52],"using":[53],"web":[54],"interfaces":[55],"where":[56],"search":[57],"operations":[58],"are":[59],"opaque":[60],"to":[61,109,121,166],"the":[62,110,117,132,179],"user.":[63],"Through":[64],"controlled":[65],"experiments,":[66],"find":[68],"consistent":[69],"failures:":[71],"retrieval":[73],"fails":[74,83],"48\u201398%":[76],"of":[77,86,100,113,119,182],"multi-reference":[78],"queries,":[79],"section-specific":[80],"extraction":[82],"72\u201391%":[85],"cases,":[87],"topical":[89],"discovery":[91],"yields":[92],"F1":[93],"scores":[94],"below":[95],"0.32,":[96],"missing":[97],"over":[98,126],"60%":[99],"relevant":[101,124],"literature.":[102],"Further":[103],"human":[104],"analysis":[105],"attributes":[106],"these":[107,155],"failures":[108],"uncontrolled":[111],"expansion":[112],"retrieved":[114],"context":[115],"tendency":[118],"prioritize":[122],"semantically":[123],"text":[125],"task":[127],"instructions.":[128],"Across":[129],"basic":[130],"tasks,":[131],"display":[134],"distinct":[135],"failure":[136],"behaviors:":[137],"ChatGPT":[138],"often":[139],"withholds":[140],"responses":[141],"rather":[142],"than":[143],"risk":[144],"errors,":[145],"whereas":[146],"Gemini":[147],"produces":[148],"fluent":[149],"but":[150],"fabricated":[151],"answers.":[152],"To":[153],"address":[154],"issues,":[156],"develop":[158],"lightweight":[159],"classifiers":[161],"trained":[162],"on":[163],"PaperAsk":[164,170],"data":[165],"identify":[167],"unreliable":[168],"outputs.":[169],"provides":[171],"reproducible":[173],"diagnostic":[175],"framework":[176],"for":[177],"advancing":[178],"evaluation":[181],"LLM-based":[183],"assistance":[185],"systems.":[186],"The":[187],"is":[189],"publicly":[190],"available":[191],"at":[192],"https://github.com/wuyoscar/PaperAsk.":[193]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-04-10T00:00:00"}
