{"id":"https://openalex.org/W4410636469","doi":"https://doi.org/10.1145/3701716.3717655","title":"Enhancing Text-Based Person Retrieval via Loss Balancing and Vision-Language Models","display_name":"Enhancing Text-Based Person Retrieval via Loss Balancing and Vision-Language Models","publication_year":2025,"publication_date":"2025-05-08","ids":{"openalex":"https://openalex.org/W4410636469","doi":"https://doi.org/10.1145/3701716.3717655"},"language":"en","primary_location":{"id":"doi:10.1145/3701716.3717655","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717655","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717655","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717655","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010031730","display_name":"Jizheng Zhang","orcid":"https://orcid.org/0000-0003-2062-725X"},"institutions":[{"id":"https://openalex.org/I194450716","display_name":"Jilin University","ror":"https://ror.org/00js3aw79","country_code":"CN","type":"education","lineage":["https://openalex.org/I194450716"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Jizheng Zhang","raw_affiliation_strings":["ZhCheng, College of Software Engineering, Jilin University, Changchun, Jilin, China"],"affiliations":[{"raw_affiliation_string":"ZhCheng, College of Software Engineering, Jilin University, Changchun, Jilin, China","institution_ids":["https://openalex.org/I194450716"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":["https://openalex.org/A5010031730"],"corresponding_institution_ids":["https://openalex.org/I194450716"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.09505052,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"1586","last_page":"1588"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10331","display_name":"Video Surveillance and Tracking Methods","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9976000189781189,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9832000136375427,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7780337929725647},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5394668579101562},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5313005447387695},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4740443229675293},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.35707369446754456},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.35066652297973633}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7780337929725647},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5394668579101562},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5313005447387695},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4740443229675293},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.35707369446754456},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.35066652297973633}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1145/3701716.3717655","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717655","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717655","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.1145/3701716.3717655","is_oa":true,"landing_page_url":"https://doi.org/10.1145/3701716.3717655","pdf_url":"https://dl.acm.org/doi/pdf/10.1145/3701716.3717655","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Companion Proceedings of the ACM on Web Conference 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4410636469.pdf","grobid_xml":"https://content.openalex.org/works/W4410636469.grobid-xml"},"referenced_works_count":5,"referenced_works":["https://openalex.org/W1949591461","https://openalex.org/W2896632102","https://openalex.org/W2963449390","https://openalex.org/W3165835426","https://openalex.org/W4236965008"],"related_works":["https://openalex.org/W2772917594","https://openalex.org/W2036807459","https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"the":[3,8,92,110,116,128],"ZhCheng":[4],"team's":[5],"solution":[6],"to":[7,22,85],"WWW":[9],"2025":[10],"challenge":[11,32],"on":[12,59,115,133],"Text-based":[13,18],"Person":[14],"Anomaly":[15],"Search":[16],"(TPAS).":[17],"person":[19],"retrieval":[20],"aims":[21],"retrieve":[23],"specific":[24],"individuals":[25],"using":[26],"natural":[27],"language":[28,102],"descriptions.":[29],"A":[30],"key":[31],"in":[33,40,100],"this":[34,60,134],"field":[35],"is":[36],"recognizing":[37],"anomalous":[38],"behaviors":[39],"real-world":[41],"scenarios,":[42],"such":[43],"as":[44,109],"unusual":[45],"actions,":[46],"rather":[47],"than":[48],"common":[49],"ones":[50],"like":[51],"walking":[52],"or":[53],"standing.":[54],"Existing":[55],"state-of-the-art":[56],"methods":[57],"underperform":[58],"new":[61],"task.":[62,136],"After":[63],"experiments,":[64],"we":[65,77,104],"identified":[66],"two":[67],"aspects-loss":[68],"functions":[69],"and":[70,112],"architecture-that":[71],"can":[72],"significantly":[73],"enhance":[74],"performance.":[75],"First,":[76],"introduce":[78],"a":[79,106],"strategy":[80],"that":[81,122],"leverages":[82],"homoscedastic":[83],"uncertainty":[84],"balance":[86],"multiple":[87],"loss":[88],"functions.":[89],"Second,":[90],"considering":[91],"remarkable":[93],"success":[94],"of":[95,130],"pretrained":[96,107],"vision-language":[97],"models":[98],"(VLMs)":[99],"vision":[101],"tasks,":[103],"adopt":[105],"VLM":[108],"backbone":[111],"fine-tune":[113],"it":[114],"existing":[117],"benchmark.":[118],"Experimental":[119],"results":[120],"demonstrate":[121],"our":[123],"proposed":[124],"approach":[125],"effectively":[126],"improves":[127],"performance":[129],"current":[131],"baselines":[132],"challenging":[135]},"counts_by_year":[],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-10-10T00:00:00"}
