{"id":"https://openalex.org/W2086904543","doi":"https://doi.org/10.3115/1117794.1117809","title":"Empirical term weighting and expansion frequency","display_name":"Empirical term weighting and expansion frequency","publication_year":2000,"publication_date":"2000-01-01","ids":{"openalex":"https://openalex.org/W2086904543","doi":"https://doi.org/10.3115/1117794.1117809","mag":"2086904543"},"language":"en","primary_location":{"id":"doi:10.3115/1117794.1117809","is_oa":true,"landing_page_url":"https://doi.org/10.3115/1117794.1117809","pdf_url":"https://dl.acm.org/doi/pdf/10.3115/1117794.1117809","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2000 Joint SIGDAT conference on Empirical methods in natural language processing and very large corpora held in conjunction with the 38th Annual Meeting of the Association for Computational Linguistics -","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.3115/1117794.1117809","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5008441752","display_name":"Kyoji Umemura","orcid":"https://orcid.org/0000-0003-0351-1626"},"institutions":[{"id":"https://openalex.org/I136259955","display_name":"Toyohashi University of Technology","ror":"https://ror.org/04ezg6d83","country_code":"JP","type":"education","lineage":["https://openalex.org/I136259955"]}],"countries":["JP"],"is_corresponding":true,"raw_author_name":"Kyoji Umemura","raw_affiliation_strings":["Toyohashi University of Technology, Toyohashi Aichi, Japan","Toyohashi Univ. of Technology, Toyohashi, Aichi (Japan)"],"affiliations":[{"raw_affiliation_string":"Toyohashi University of Technology, Toyohashi Aichi, Japan","institution_ids":["https://openalex.org/I136259955"]},{"raw_affiliation_string":"Toyohashi Univ. of Technology, Toyohashi, Aichi (Japan)","institution_ids":["https://openalex.org/I136259955"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5016543371","display_name":"Kenneth Church","orcid":"https://orcid.org/0000-0001-8378-6069"},"institutions":[{"id":"https://openalex.org/I1283103587","display_name":"AT&T (United States)","ror":"https://ror.org/02bbd5539","country_code":"US","type":"company","lineage":["https://openalex.org/I1283103587"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Kenneth W. Church","raw_affiliation_strings":["AT&T Labs-Research, Florham Park, NJ","AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#"],"affiliations":[{"raw_affiliation_string":"AT&T Labs-Research, Florham Park, NJ","institution_ids":["https://openalex.org/I1283103587"]},{"raw_affiliation_string":"AT&T Labs\u2014\u2014Research, Florham Park, NJ#TAB#","institution_ids":["https://openalex.org/I1283103587"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":["https://openalex.org/A5008441752"],"corresponding_institution_ids":["https://openalex.org/I136259955"],"apc_list":null,"apc_paid":null,"fwci":2.6523,"has_fulltext":true,"cited_by_count":9,"citation_normalized_percentile":{"value":0.91018344,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":89,"max":94},"biblio":{"volume":"13","issue":null,"first_page":"117","last_page":"123"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9955000281333923,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9952999949455261,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9929999709129333,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8319549560546875},{"id":"https://openalex.org/keywords/burstiness","display_name":"Burstiness","score":0.7495689392089844},{"id":"https://openalex.org/keywords/tf\u2013idf","display_name":"tf\u2013idf","score":0.7454296350479126},{"id":"https://openalex.org/keywords/term","display_name":"Term (time)","score":0.6904653310775757},{"id":"https://openalex.org/keywords/generalization","display_name":"Generalization","score":0.67042076587677},{"id":"https://openalex.org/keywords/weighting","display_name":"Weighting","score":0.6302354335784912},{"id":"https://openalex.org/keywords/simple","display_name":"Simple (philosophy)","score":0.5291006565093994},{"id":"https://openalex.org/keywords/relevance","display_name":"Relevance (law)","score":0.45645615458488464},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.43421339988708496},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.4219544231891632},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.3412162661552429},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.32562610507011414},{"id":"https://openalex.org/keywords/mathematics","display_name":"Mathematics","score":0.1201593279838562}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8319549560546875},{"id":"https://openalex.org/C2781023610","wikidata":"https://www.wikidata.org/wiki/Q17006304","display_name":"Burstiness","level":3,"score":0.7495689392089844},{"id":"https://openalex.org/C81758059","wikidata":"https://www.wikidata.org/wiki/Q796584","display_name":"tf\u2013idf","level":3,"score":0.7454296350479126},{"id":"https://openalex.org/C61797465","wikidata":"https://www.wikidata.org/wiki/Q1188986","display_name":"Term (time)","level":2,"score":0.6904653310775757},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.67042076587677},{"id":"https://openalex.org/C183115368","wikidata":"https://www.wikidata.org/wiki/Q856577","display_name":"Weighting","level":2,"score":0.6302354335784912},{"id":"https://openalex.org/C2780586882","wikidata":"https://www.wikidata.org/wiki/Q7520643","display_name":"Simple (philosophy)","level":2,"score":0.5291006565093994},{"id":"https://openalex.org/C158154518","wikidata":"https://www.wikidata.org/wiki/Q7310970","display_name":"Relevance (law)","level":2,"score":0.45645615458488464},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.43421339988708496},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.4219544231891632},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3412162661552429},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.32562610507011414},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.1201593279838562},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C126838900","wikidata":"https://www.wikidata.org/wiki/Q77604","display_name":"Radiology","level":1,"score":0.0},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C158379750","wikidata":"https://www.wikidata.org/wiki/Q214111","display_name":"Network packet","level":2,"score":0.0},{"id":"https://openalex.org/C199539241","wikidata":"https://www.wikidata.org/wiki/Q7748","display_name":"Law","level":1,"score":0.0},{"id":"https://openalex.org/C71924100","wikidata":"https://www.wikidata.org/wiki/Q11190","display_name":"Medicine","level":0,"score":0.0},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"id":"doi:10.3115/1117794.1117809","is_oa":true,"landing_page_url":"https://doi.org/10.3115/1117794.1117809","pdf_url":"https://dl.acm.org/doi/pdf/10.3115/1117794.1117809","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2000 Joint SIGDAT conference on Empirical methods in natural language processing and very large corpora held in conjunction with the 38th Annual Meeting of the Association for Computational Linguistics -","raw_type":"proceedings-article"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.119.9850","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.119.9850","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://www.cs.columbia.edu/~sable/research/umemura_church.pdf","raw_type":"text"},{"id":"pmh:oai:CiteSeerX.psu:10.1.1.13.7662","is_oa":false,"landing_page_url":"http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.13.7662","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"http://acl.ldc.upenn.edu/W/W00/W00-1315.pdf","raw_type":"text"}],"best_oa_location":{"id":"doi:10.3115/1117794.1117809","is_oa":true,"landing_page_url":"https://doi.org/10.3115/1117794.1117809","pdf_url":"https://dl.acm.org/doi/pdf/10.3115/1117794.1117809","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 2000 Joint SIGDAT conference on Empirical methods in natural language processing and very large corpora held in conjunction with the 38th Annual Meeting of the Association for Computational Linguistics -","raw_type":"proceedings-article"},"sustainable_development_goals":[{"score":0.6100000143051147,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"awards":[],"funders":[{"id":"https://openalex.org/F4320309370","display_name":"University of Pennsylvania","ror":"https://ror.org/00b30xv10"}],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W2086904543.pdf","grobid_xml":"https://content.openalex.org/works/W2086904543.grobid-xml"},"referenced_works_count":7,"referenced_works":["https://openalex.org/W73194202","https://openalex.org/W196743200","https://openalex.org/W1479966022","https://openalex.org/W1525341925","https://openalex.org/W1975690018","https://openalex.org/W2044490427","https://openalex.org/W2113110240"],"related_works":["https://openalex.org/W1843717240","https://openalex.org/W2340440348","https://openalex.org/W2508200300","https://openalex.org/W2017917301","https://openalex.org/W1973723221","https://openalex.org/W4281569536","https://openalex.org/W2171473894","https://openalex.org/W2741230576","https://openalex.org/W2372355498","https://openalex.org/W1441686524"],"abstract_inverted_index":{"We":[0,208],"propose":[1],"an":[2],"empirical":[3],"method":[4,71,78,119],"for":[5,25,174],"estimating":[6],"term":[7,31,148],"weights":[8,28],"directly":[9],"from":[10,160],"relevance":[11,81],"judgments,":[12],"avoiding":[13],"various":[14],"standard":[15],"but":[16,46,101],"potentially":[17],"trouble-some":[18],"assumptions.":[19,69],"It":[20],"is":[21,72,149,216],"common":[22],"to":[23,74,105,124,164,177,218],"assume,":[24],"example,":[26],"that":[27,49,62,211],"vary":[29],"with":[30],"frequency":[32,37,137,156],"(tf)":[33],"and":[34,112,135,220],"inverse":[35],"document":[36],"(idf)":[38],"in":[39,58,168,185,203],"a":[40,86,121,161,201],"particular":[41],"way,":[42],"e.g.,":[43],"tf":[44],".idf,":[45],"the":[47,59,75,169,178,182,186,204,212],"fact":[48],"there":[50,63],"are":[51,83,209],"so":[52],"many":[53],"variants":[54],"of":[55,89,140,146,180,226],"this":[56,224],"formula":[57],"literature":[60],"suggests":[61],"remains":[64],"considerable":[65],"uncertainty":[66],"about":[67],"these":[68],"Our":[70],"similar":[73],"Berkeley":[76],"regression":[77],"where":[79],"labeled":[80],"judgments":[82],"fit":[84],"as":[85,110,131],"linear":[87],"combination":[88],"(transforms":[90],"of)":[91],"tf,":[92,132],"idf,":[93,133],"etc.":[94],"Training":[95],"methods":[96],"not":[97],"only":[98],"improve":[99],"performance,":[100],"also":[102],"extend":[103],"naturally":[104],"include":[106],"additional":[107],"factors":[108,129],"such":[109,130],"burstiness":[111,134],"query":[113,141],"expansion.":[114],"The":[115,143],"proposed":[116,213],"histogram-based":[117],"training":[118],"provides":[120],"simple":[122,217],"way":[123],"model":[125],"complicated":[126],"interactions":[127],"among":[128],"expansion":[136],"(a":[138],"generalization":[139],"expansion).":[142],"correct":[144],"handling":[145],"expanded":[147],"realized":[150],"based":[151],"on":[152],"statistical":[153],"information.":[154],"Expansion":[155],"dramatically":[157],"improves":[158],"performance":[159],"level":[162,179,225],"comparable":[163],"BKJJBIDS,":[165],"Berkeley's":[166],"entry":[167],"Japanese":[170,205],"NACSIS":[171],"NTCIR-1":[172],"evaluation":[173],"short":[175],"queries,":[176],"JCB1,":[181],"top":[183],"system":[184],"evaluation.":[187],"JCB1":[188],"uses":[189],"sophisticated":[190],"(and":[191],"proprietary)":[192],"natural":[193],"language":[194],"processing":[195],"techniques":[196],"developed":[197],"by":[198],"Just":[199],"System,":[200],"leader":[202],"word-processing":[206],"industry.":[207],"encouraged":[210],"method,":[214],"which":[215],"understand":[219],"replicate,":[221],"can":[222],"reach":[223],"performance.":[227]},"counts_by_year":[{"year":2013,"cited_by_count":1}],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
