{"id":"https://openalex.org/W4386556254","doi":"https://doi.org/10.48550/arxiv.2309.02999","title":"Vote2Cap-DETR++: Decoupling Localization and Describing for End-to-End 3D Dense Captioning","display_name":"Vote2Cap-DETR++: Decoupling Localization and Describing for End-to-End 3D Dense Captioning","publication_year":2023,"publication_date":"2023-09-06","ids":{"openalex":"https://openalex.org/W4386556254","doi":"https://doi.org/10.48550/arxiv.2309.02999"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2309.02999","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.02999","pdf_url":"https://arxiv.org/pdf/2309.02999","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2309.02999","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5114804332","display_name":"Sijin Chen","orcid":"https://orcid.org/0009-0008-1319-746X"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Chen, Sijin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102483130","display_name":"Hongyuan Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Hongyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101698381","display_name":"Mingsheng Li","orcid":"https://orcid.org/0009-0004-3758-0018"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Mingsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100624798","display_name":"Xin Chen","orcid":"https://orcid.org/0009-0005-0200-2493"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Xin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114804035","display_name":"Peng Guo","orcid":"https://orcid.org/0009-0006-9848-2272"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Peng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102831936","display_name":"Yinjie Lei","orcid":"https://orcid.org/0000-0001-6856-3342"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lei, Yinjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5003400275","display_name":"Gang Yu","orcid":"https://orcid.org/0000-0001-5570-2710"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Gang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021697903","display_name":"Taihao Li","orcid":"https://orcid.org/0000-0003-3279-7125"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Taihao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5075177714","display_name":"Tao Chen","orcid":"https://orcid.org/0000-0001-8239-1698"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Tao","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5114804332"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9995999932289124,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.998199999332428,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.996999979019165,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.830899715423584},{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8045675754547119},{"id":"https://openalex.org/keywords/decoding-methods","display_name":"Decoding methods","score":0.6146054267883301},{"id":"https://openalex.org/keywords/pipeline","display_name":"Pipeline (software)","score":0.5752539038658142},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.4670138955116272},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4498322904109955},{"id":"https://openalex.org/keywords/data-mining","display_name":"Data mining","score":0.39986932277679443},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.32926154136657715},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32407912611961365},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.14974665641784668},{"id":"https://openalex.org/keywords/algorithm","display_name":"Algorithm","score":0.11859157681465149}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.830899715423584},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8045675754547119},{"id":"https://openalex.org/C57273362","wikidata":"https://www.wikidata.org/wiki/Q576722","display_name":"Decoding methods","level":2,"score":0.6146054267883301},{"id":"https://openalex.org/C43521106","wikidata":"https://www.wikidata.org/wiki/Q2165493","display_name":"Pipeline (software)","level":2,"score":0.5752539038658142},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.4670138955116272},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4498322904109955},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.39986932277679443},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.32926154136657715},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32407912611961365},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.14974665641784668},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.11859157681465149},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2309.02999","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.02999","pdf_url":"https://arxiv.org/pdf/2309.02999","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2309.02999","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2309.02999","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2309.02999","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.02999","pdf_url":"https://arxiv.org/pdf/2309.02999","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/16","score":0.41999998688697815,"display_name":"Peace, Justice and strong institutions"}],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4386556254.pdf","grobid_xml":"https://content.openalex.org/works/W4386556254.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"3D":[0,13,37,66],"dense":[1],"captioning":[2],"requires":[3],"a":[4,26,36,75,113,197],"model":[5],"to":[6,54,118,138,150,166],"translate":[7],"its":[8],"understanding":[9],"of":[10,58,84,105,116],"an":[11,125],"input":[12],"scene":[14,106],"into":[15,133],"several":[16],"captions":[17],"associated":[18],"with":[19,39],"different":[20,103],"object":[21,88,97],"regions.":[22],"Existing":[23],"methods":[24,45,195],"adopt":[25],"sophisticated":[27],"\"detect-then-describe\"":[28,194],"pipeline,":[29],"which":[30,108,129],"builds":[31],"explicit":[32],"relation":[33],"modules":[34],"upon":[35],"detector":[38],"numerous":[40],"hand-crafted":[41],"components.":[42],"While":[43],"these":[44],"have":[46],"achieved":[47],"initial":[48],"success,":[49],"the":[50,81,131,145,167],"cascade":[51],"pipeline":[52],"tends":[53],"accumulate":[55],"errors":[56],"because":[57],"duplicated":[59],"and":[60,64,87,99,135,156,176,186,190],"inaccurate":[61],"box":[62],"estimations":[63],"messy":[65],"scenes.":[67],"In":[68],"this":[69,121],"paper,":[70],"we":[71,94,123,143],"first":[72],"propose":[73,124],"Vote2Cap-DETR,":[74],"simple-yet-effective":[76],"transformer":[77],"framework":[78],"that":[79,96],"decouples":[80,130],"decoding":[82],"process":[83],"caption":[85,136,168],"generation":[86,101],"localization":[89,98,134,158],"through":[90],"parallel":[91],"decoding.":[92],"Moreover,":[93],"argue":[95],"description":[100],"require":[102],"levels":[104],"understanding,":[107],"could":[109],"be":[110,202],"challenging":[111],"for":[112,153,170],"shared":[114],"set":[115],"queries":[117,132,137,152],"capture.":[119],"To":[120],"end,":[122],"advanced":[126],"version,":[127],"Vote2Cap-DETR++,":[128],"capture":[139],"task-specific":[140],"features.":[141],"Additionally,":[142],"introduce":[144],"iterative":[146],"spatial":[147,164],"refinement":[148],"strategy":[149],"vote":[151],"faster":[154],"convergence":[155],"better":[157],"performance.":[159],"We":[160],"also":[161],"insert":[162],"additional":[163],"information":[165],"head":[169],"more":[171],"accurate":[172],"descriptions.":[173],"Without":[174],"bells":[175],"whistles,":[177],"extensive":[178],"experiments":[179],"on":[180],"two":[181],"commonly":[182],"used":[183],"datasets,":[184],"ScanRefer":[185],"Nr3D,":[187],"demonstrate":[188],"Vote2Cap-DETR":[189],"Vote2Cap-DETR++":[191],"surpass":[192],"conventional":[193],"by":[196],"large":[198],"margin.":[199],"Codes":[200],"will":[201],"made":[203],"available":[204],"at":[205],"https://github.com/ch3cook-fdu/Vote2Cap-DETR.":[206]},"counts_by_year":[],"updated_date":"2026-02-09T09:26:11.010843","created_date":"2023-09-09T00:00:00"}
