{"id":"https://openalex.org/W7140216459","doi":"https://doi.org/10.48550/arxiv.2603.20990","title":"ECI: Effective Contrastive Information to Evaluate Hard-Negatives","display_name":"ECI: Effective Contrastive Information to Evaluate Hard-Negatives","publication_year":2026,"publication_date":"2026-03-22","ids":{"openalex":"https://openalex.org/W7140216459","doi":"https://doi.org/10.48550/arxiv.2603.20990"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2603.20990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2603.20990","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":null,"display_name":"Sinha, Aarush","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Sinha, Aarush","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":null,"display_name":"Seetharaman, Rahul","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Seetharaman, Rahul","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":null,"display_name":"Bansal, Aman","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Bansal, Aman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9336000084877014,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10286","display_name":"Information Retrieval and Search Behavior","score":0.9336000084877014,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.012299999594688416,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10918","display_name":"Memory Processes and Influences","score":0.007300000172108412,"subfield":{"id":"https://openalex.org/subfields/2805","display_name":"Cognitive Neuroscience"},"field":{"id":"https://openalex.org/fields/28","display_name":"Neuroscience"},"domain":{"id":"https://openalex.org/domains/1","display_name":"Life Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/discriminative-model","display_name":"Discriminative model","score":0.8073999881744385},{"id":"https://openalex.org/keywords/metric","display_name":"Metric (unit)","score":0.5687000155448914},{"id":"https://openalex.org/keywords/set","display_name":"Set (abstract data type)","score":0.5472000241279602},{"id":"https://openalex.org/keywords/heuristic","display_name":"Heuristic","score":0.5231000185012817},{"id":"https://openalex.org/keywords/mutual-information","display_name":"Mutual information","score":0.5142999887466431},{"id":"https://openalex.org/keywords/sampling","display_name":"Sampling (signal processing)","score":0.41670000553131104},{"id":"https://openalex.org/keywords/generative-grammar","display_name":"Generative grammar","score":0.39329999685287476},{"id":"https://openalex.org/keywords/interaction-information","display_name":"Interaction information","score":0.3903999924659729},{"id":"https://openalex.org/keywords/data-set","display_name":"Data set","score":0.3635999858379364}],"concepts":[{"id":"https://openalex.org/C97931131","wikidata":"https://www.wikidata.org/wiki/Q5282087","display_name":"Discriminative model","level":2,"score":0.8073999881744385},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6854000091552734},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.5687000155448914},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.5472000241279602},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5322999954223633},{"id":"https://openalex.org/C173801870","wikidata":"https://www.wikidata.org/wiki/Q201413","display_name":"Heuristic","level":2,"score":0.5231000185012817},{"id":"https://openalex.org/C152139883","wikidata":"https://www.wikidata.org/wiki/Q252973","display_name":"Mutual information","level":2,"score":0.5142999887466431},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4255000054836273},{"id":"https://openalex.org/C140779682","wikidata":"https://www.wikidata.org/wiki/Q210868","display_name":"Sampling (signal processing)","level":3,"score":0.41670000553131104},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.40450000762939453},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.39329999685287476},{"id":"https://openalex.org/C38764148","wikidata":"https://www.wikidata.org/wiki/Q17098245","display_name":"Interaction information","level":2,"score":0.3903999924659729},{"id":"https://openalex.org/C58489278","wikidata":"https://www.wikidata.org/wiki/Q1172284","display_name":"Data set","level":2,"score":0.3635999858379364},{"id":"https://openalex.org/C52622258","wikidata":"https://www.wikidata.org/wiki/Q131222","display_name":"Information theory","level":2,"score":0.36169999837875366},{"id":"https://openalex.org/C195807954","wikidata":"https://www.wikidata.org/wiki/Q1662562","display_name":"Information extraction","level":2,"score":0.3370000123977661},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3255999982357025},{"id":"https://openalex.org/C2776502983","wikidata":"https://www.wikidata.org/wiki/Q690182","display_name":"Contrast (vision)","level":2,"score":0.3249000012874603},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.3248000144958496},{"id":"https://openalex.org/C2779954242","wikidata":"https://www.wikidata.org/wiki/Q6031227","display_name":"Information structure","level":2,"score":0.31949999928474426},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.3098999857902527},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3061999976634979},{"id":"https://openalex.org/C112789634","wikidata":"https://www.wikidata.org/wiki/Q18207010","display_name":"False positives and false negatives","level":3,"score":0.3043999969959259},{"id":"https://openalex.org/C2779843651","wikidata":"https://www.wikidata.org/wiki/Q7390335","display_name":"SIGNAL (programming language)","level":2,"score":0.3019999861717224},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.2939999997615814},{"id":"https://openalex.org/C2780522230","wikidata":"https://www.wikidata.org/wiki/Q1140419","display_name":"Ambiguity","level":2,"score":0.29100000858306885},{"id":"https://openalex.org/C39927690","wikidata":"https://www.wikidata.org/wiki/Q11197","display_name":"Logarithm","level":2,"score":0.29030001163482666},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.2879999876022339},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.2639999985694885},{"id":"https://openalex.org/C45983554","wikidata":"https://www.wikidata.org/wiki/Q3412851","display_name":"Information quality","level":3,"score":0.263700008392334},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.25540000200271606}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2603.20990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2603.20990","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2603.20990","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/10","score":0.7371163368225098,"display_name":"Reduced inequalities"}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Hard":[0],"negatives":[1,37,89,96,135],"play":[2],"a":[3,68,117],"critical":[4],"role":[5],"in":[6,54,73,137],"training":[7],"and":[8,23,51,76,114,124,152,177],"fine-tuning":[9,45],"dense":[10],"retrieval":[11,31,164],"models,":[12],"as":[13],"they":[14],"are":[15],"semantically":[16],"similar":[17],"to":[18,83,91],"positive":[19],"documents":[20],"yet":[21],"non-relevant,":[22],"correctly":[24],"distinguishing":[25],"them":[26],"is":[27],"essential":[28],"for":[29,183],"improving":[30],"accuracy.":[32],"However,":[33],"identifying":[34,166],"effective":[35],"hard":[36,88],"typically":[38],"requires":[39],"extensive":[40],"ablation":[41,186],"studies":[42],"involving":[43],"repeated":[44],"with":[46],"different":[47],"negative":[48],"sampling":[49],"strategies":[50,169],"hyperparameters,":[52],"resulting":[53],"substantial":[55],"computational":[56],"cost.":[57],"In":[58],"this":[59],"paper,":[60],"we":[61],"introduce":[62],"ECI:":[63],"Effective":[64],"Contrastive":[65],"Information":[66,74,77,102],",":[67],"theoretically":[69],"grounded":[70,72],"metric":[71],"Theory":[75],"Retrieval":[78],"principles":[79],"that":[80,159,167],"enables":[81],"practitioners":[82],"assess":[84],"the":[85,99,104,172,181],"quality":[86],"of":[87,120,175],"prior":[90],"model":[92],"fine-tuning.":[93],"ECI":[94,130,142,160],"evaluates":[95],"by":[97,111],"optimizing":[98],"trade-off":[100],"between":[101],"Capacity":[103],"logarithmic":[105],"bound":[106],"on":[107],"mutual":[108],"information":[109],"determined":[110],"set":[112],"size":[113],"Discriminative":[115],"Efficiency,":[116],"harmonic":[118],"balance":[119,174],"Signal":[121],"Magnitude":[122],"(Hardness)":[123],"Safety":[125],"(Max-Margin).":[126],"Unlike":[127],"heuristic":[128],"approaches,":[129],"strictly":[131],"penalizes":[132],"unsafe,":[133],"false-positive":[134],"prevalent":[136],"generative":[138],"methods.":[139],"We":[140],"evaluate":[141],"across":[143],"hard-negative":[144],"sets":[145],"mined":[146],"or":[147],"generated":[148],"using":[149],"BM25,":[150],"cross-encoders,":[151],"large":[153],"language":[154],"models.":[155],"Our":[156],"results":[157],"demonstrate":[158],"accurately":[161],"predicts":[162],"downstream":[163],"performance,":[165],"hybrid":[168],"(BM25+Cross-Encoder)":[170],"offer":[171],"optimal":[173],"volume":[176],"reliability,":[178],"significantly":[179],"reducing":[180],"need":[182],"costly":[184],"end-to-end":[185],"studies.":[187]},"counts_by_year":[],"updated_date":"2026-04-25T08:17:42.794288","created_date":"2026-03-25T00:00:00"}
