{"id":"https://openalex.org/W4304080540","doi":"https://doi.org/10.1145/3503161.3547787","title":"You Can even Annotate Text with Voice: Transcription-only-Supervised Text Spotting","display_name":"You Can even Annotate Text with Voice: Transcription-only-Supervised Text Spotting","publication_year":2022,"publication_date":"2022-10-10","ids":{"openalex":"https://openalex.org/W4304080540","doi":"https://doi.org/10.1145/3503161.3547787"},"language":"en","primary_location":{"id":"doi:10.1145/3503161.3547787","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547787","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://discovery.ucl.ac.uk/10153010/1/P36__Tang__2022__ACMMM.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100681719","display_name":"Jingqun Tang","orcid":"https://orcid.org/0000-0003-2577-0119"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Jingqun Tang","raw_affiliation_strings":["Ant Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Ant Group, Hangzhou, China","institution_ids":[]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045593464","display_name":"Qiao Su","orcid":"https://orcid.org/0000-0001-5626-1558"},"institutions":[{"id":"https://openalex.org/I75059550","display_name":"Zhejiang Gongshang University","ror":"https://ror.org/0569mkk41","country_code":"CN","type":"education","lineage":["https://openalex.org/I75059550"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Su Qiao","raw_affiliation_strings":["Zhejiang Gongshang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang Gongshang University, Hangzhou, China","institution_ids":["https://openalex.org/I75059550"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030151061","display_name":"Benlei Cui","orcid":null},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Benlei Cui","raw_affiliation_strings":["Alibaba Group, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Hangzhou, China","institution_ids":["https://openalex.org/I45928872"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5052491096","display_name":"Yuhang Ma","orcid":"https://orcid.org/0000-0002-4734-1019"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Yuhang Ma","raw_affiliation_strings":["University College London, London, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University College London, London, United Kingdom","institution_ids":["https://openalex.org/I45129253"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5021142269","display_name":"Sheng Zhang","orcid":"https://orcid.org/0000-0001-5599-6261"},"institutions":[{"id":"https://openalex.org/I76130692","display_name":"Zhejiang University","ror":"https://ror.org/00a2xv884","country_code":"CN","type":"education","lineage":["https://openalex.org/I76130692"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Sheng Zhang","raw_affiliation_strings":["Zhejiang University, Hangzhou, China"],"affiliations":[{"raw_affiliation_string":"Zhejiang University, Hangzhou, China","institution_ids":["https://openalex.org/I76130692"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5048122691","display_name":"Dimitrios Kanoulas","orcid":"https://orcid.org/0000-0002-3684-1472"},"institutions":[{"id":"https://openalex.org/I45129253","display_name":"University College London","ror":"https://ror.org/02jx3x895","country_code":"GB","type":"education","lineage":["https://openalex.org/I124357947","https://openalex.org/I45129253"]}],"countries":["GB"],"is_corresponding":false,"raw_author_name":"Dimitrios Kanoulas","raw_affiliation_strings":["University College London, London, United Kingdom"],"affiliations":[{"raw_affiliation_string":"University College London, London, United Kingdom","institution_ids":["https://openalex.org/I45129253"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":6,"corresponding_author_ids":["https://openalex.org/A5100681719"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.1938,"has_fulltext":true,"cited_by_count":20,"citation_normalized_percentile":{"value":0.85954578,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":97,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"4154","last_page":"4163"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9887999892234802,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9882000088691711,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spotting","display_name":"Spotting","score":0.9340761303901672},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8585617542266846},{"id":"https://openalex.org/keywords/annotation","display_name":"Annotation","score":0.6830949783325195},{"id":"https://openalex.org/keywords/keyword-spotting","display_name":"Keyword spotting","score":0.5593274235725403},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.5510361194610596},{"id":"https://openalex.org/keywords/transcription","display_name":"Transcription (linguistics)","score":0.5398916006088257},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5054903030395508},{"id":"https://openalex.org/keywords/information-retrieval","display_name":"Information retrieval","score":0.4795812666416168}],"concepts":[{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.9340761303901672},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8585617542266846},{"id":"https://openalex.org/C2776321320","wikidata":"https://www.wikidata.org/wiki/Q857525","display_name":"Annotation","level":2,"score":0.6830949783325195},{"id":"https://openalex.org/C2781213101","wikidata":"https://www.wikidata.org/wiki/Q6398558","display_name":"Keyword spotting","level":2,"score":0.5593274235725403},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5510361194610596},{"id":"https://openalex.org/C179926584","wikidata":"https://www.wikidata.org/wiki/Q207714","display_name":"Transcription (linguistics)","level":2,"score":0.5398916006088257},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5054903030395508},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4795812666416168},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1145/3503161.3547787","is_oa":false,"landing_page_url":"https://doi.org/10.1145/3503161.3547787","pdf_url":null,"source":{"id":"https://openalex.org/S4363608757","display_name":"Proceedings of the 30th ACM International Conference on Multimedia","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 30th ACM International Conference on Multimedia","raw_type":"proceedings-article"},{"id":"pmh:oai:eprints.ucl.ac.uk.OAI2:10153010","is_oa":true,"landing_page_url":"https://discovery.ucl.ac.uk/id/eprint/10153010/","pdf_url":"https://discovery.ucl.ac.uk/10153010/1/P36__Tang__2022__ACMMM.pdf","source":{"id":"https://openalex.org/S4306400024","display_name":"UCL Discovery (University College London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I45129253","host_organization_name":"University College London","host_organization_lineage":["https://openalex.org/I45129253"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"     In:   (Proceedings) 30th ACM International Conference on Multimedia.   ACM Press: Lisbon, Portugal. (2022)    (In press).  ","raw_type":"Proceedings paper"}],"best_oa_location":{"id":"pmh:oai:eprints.ucl.ac.uk.OAI2:10153010","is_oa":true,"landing_page_url":"https://discovery.ucl.ac.uk/id/eprint/10153010/","pdf_url":"https://discovery.ucl.ac.uk/10153010/1/P36__Tang__2022__ACMMM.pdf","source":{"id":"https://openalex.org/S4306400024","display_name":"UCL Discovery (University College London)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I45129253","host_organization_name":"University College London","host_organization_lineage":["https://openalex.org/I45129253"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"     In:   (Proceedings) 30th ACM International Conference on Multimedia.   ACM Press: Lisbon, Portugal. (2022)    (In press).  ","raw_type":"Proceedings paper"},"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/4","score":0.5899999737739563,"display_name":"Quality Education"}],"awards":[],"funders":[],"has_content":{"grobid_xml":false,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4304080540.pdf"},"referenced_works_count":30,"referenced_works":["https://openalex.org/W2194775991","https://openalex.org/W2601564443","https://openalex.org/W2766895242","https://openalex.org/W2875814315","https://openalex.org/W2962966271","https://openalex.org/W2962986948","https://openalex.org/W2964296749","https://openalex.org/W2981969038","https://openalex.org/W2983626510","https://openalex.org/W2988098900","https://openalex.org/W2989176720","https://openalex.org/W2996956254","https://openalex.org/W2997371611","https://openalex.org/W3003868038","https://openalex.org/W3096609285","https://openalex.org/W3110398855","https://openalex.org/W3159307593","https://openalex.org/W3181016597","https://openalex.org/W3187369021","https://openalex.org/W3196976036","https://openalex.org/W3205994442","https://openalex.org/W3207004259","https://openalex.org/W3207083256","https://openalex.org/W4226379819","https://openalex.org/W4286696412","https://openalex.org/W4287854430","https://openalex.org/W4296586882","https://openalex.org/W4312351507","https://openalex.org/W4312593844","https://openalex.org/W6681452975"],"related_works":["https://openalex.org/W2918559346","https://openalex.org/W2114097550","https://openalex.org/W4286904253","https://openalex.org/W3119978414","https://openalex.org/W2516975559","https://openalex.org/W3206647229","https://openalex.org/W1969408022","https://openalex.org/W2000885660","https://openalex.org/W2545741539","https://openalex.org/W1989658893"],"abstract_inverted_index":{"End-to-end":[0],"scene":[1,38,160,182],"text":[2,25,39,46,70,83,127,135,161,183,202],"spotting":[3,40,136,203],"has":[4],"recently":[5],"gained":[6],"great":[7],"attention":[8,88],"in":[9,159],"the":[10,21,50,67,82,93,97,104,143],"research":[11],"community.":[12],"The":[13,185],"majority":[14],"of":[15,24,69,95,106,200],"existing":[16],"methods":[17],"rely":[18],"heavily":[19],"on":[20,169,179],"location":[22,54,64],"annotations":[23,166],"instances":[26],"(e.g.,":[27],"word-level":[28,30],"boxes,":[29],"masks,":[31],"and":[32,72,156,172,195],"char-level":[33],"boxes).":[34],"We":[35,56],"demonstrate":[36],"that":[37],"can":[41],"be":[42],"accomplished":[43],"solely":[44],"via":[45,66,86,109,137],"transcription,":[47],"significantly":[48],"reducing":[49],"need":[51],"for":[52,123,134,146],"costly":[53],"annotations.":[55],"propose":[57,117],"a":[58,110,118,132,151,189],"query-based":[59],"paradigm":[60],"to":[61,92],"learn":[62],"implicit":[63],"features":[65,76],"interaction":[68],"queries":[71],"image":[73,157],"embeddings.":[74],"These":[75],"are":[77],"then":[78],"made":[79],"explicit":[80],"during":[81],"recognition":[84],"stage":[85],"an":[87],"activation":[89],"map.":[90],"Due":[91],"difficulty":[94],"training":[96],"weakly-supervised":[98],"model":[99,107,193],"from":[100],"scratch,":[101],"we":[102,116,130,175],"address":[103],"issue":[105],"convergence":[108],"circular":[111],"curriculum":[112],"learning":[113],"strategy.":[114],"Additionally,":[115],"coarse-to-fine":[119],"cross-attention":[120],"localization":[121],"mechanism":[122],"more":[124],"precisely":[125],"locating":[126],"instances.":[128],"Notably,":[129],"provide":[131],"solution":[133],"audio":[138],"annotation,":[139],"which":[140],"further":[141],"reduces":[142],"time":[144],"required":[145],"annotation.":[147],"Moreover,":[148],"it":[149],"establishes":[150],"link":[152],"between":[153,192],"audio,":[154],"text,":[155],"modalities":[158],"spotting.":[162],"Using":[163],"only":[164],"transcription":[165],"as":[167],"supervision":[168],"both":[170],"real":[171],"synthetic":[173],"data,":[174],"achieve":[176],"competitive":[177],"results":[178],"several":[180],"popular":[181],"benchmarks.":[184],"proposed":[186],"method":[187],"offers":[188],"reasonable":[190],"trade-off":[191],"accuracy":[194],"annotation":[196],"time,":[197],"allowing":[198],"simplification":[199],"large-scale":[201],"applications.":[204]},"counts_by_year":[{"year":2025,"cited_by_count":11},{"year":2024,"cited_by_count":5},{"year":2023,"cited_by_count":4}],"updated_date":"2026-04-05T17:49:38.594831","created_date":"2025-10-10T00:00:00"}
