{"id":"https://openalex.org/W4389628727","doi":"https://doi.org/10.1109/ivcnz61134.2023.10344297","title":"Sequential Image Storytelling Model Based on Transformer Attention Pooling","display_name":"Sequential Image Storytelling Model Based on Transformer Attention Pooling","publication_year":2023,"publication_date":"2023-11-29","ids":{"openalex":"https://openalex.org/W4389628727","doi":"https://doi.org/10.1109/ivcnz61134.2023.10344297"},"language":"en","primary_location":{"id":"doi:10.1109/ivcnz61134.2023.10344297","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ivcnz61134.2023.10344297","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 38th International Conference on Image and Vision Computing New Zealand (IVCNZ)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5016068746","display_name":"Zainy M. Malakan","orcid":"https://orcid.org/0000-0002-6980-0992"},"institutions":[{"id":"https://openalex.org/I199693650","display_name":"Umm al-Qura University","ror":"https://ror.org/01xjqrm90","country_code":"SA","type":"education","lineage":["https://openalex.org/I199693650"]},{"id":"https://openalex.org/I177877127","display_name":"University of Western Australia","ror":"https://ror.org/047272k79","country_code":"AU","type":"education","lineage":["https://openalex.org/I177877127"]}],"countries":["AU","SA"],"is_corresponding":true,"raw_author_name":"Zainy M. Malakan","raw_affiliation_strings":["The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","Department of Data Science, Umm Al-Qura University, Makkah, Kingdom of Saudi Arabia","Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","institution_ids":["https://openalex.org/I177877127"]},{"raw_affiliation_string":"Department of Data Science, Umm Al-Qura University, Makkah, Kingdom of Saudi Arabia","institution_ids":["https://openalex.org/I199693650"]},{"raw_affiliation_string":"Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia","institution_ids":["https://openalex.org/I177877127"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023496317","display_name":"Ghulam Mubashar Hassan","orcid":"https://orcid.org/0000-0002-6636-8807"},"institutions":[{"id":"https://openalex.org/I177877127","display_name":"University of Western Australia","ror":"https://ror.org/047272k79","country_code":"AU","type":"education","lineage":["https://openalex.org/I177877127"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ghulam Mubashar Hassan","raw_affiliation_strings":["The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","institution_ids":["https://openalex.org/I177877127"]},{"raw_affiliation_string":"Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia","institution_ids":["https://openalex.org/I177877127"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5089986388","display_name":"Ajmal Mian","orcid":"https://orcid.org/0000-0002-5206-3842"},"institutions":[{"id":"https://openalex.org/I177877127","display_name":"University of Western Australia","ror":"https://ror.org/047272k79","country_code":"AU","type":"education","lineage":["https://openalex.org/I177877127"]}],"countries":["AU"],"is_corresponding":false,"raw_author_name":"Ajmal Mian","raw_affiliation_strings":["The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia"],"affiliations":[{"raw_affiliation_string":"The University of Western Australia,Department of Computer Science and Software Engineering,Perth,Australia","institution_ids":["https://openalex.org/I177877127"]},{"raw_affiliation_string":"Department of Computer Science and Software Engineering, The University of Western Australia, Perth, Australia","institution_ids":["https://openalex.org/I177877127"]}]}],"institutions":[],"countries_distinct_count":2,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5016068746"],"corresponding_institution_ids":["https://openalex.org/I177877127","https://openalex.org/I199693650"],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.16613581,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":"30","issue":null,"first_page":"1","last_page":"6"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9998999834060669,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9804999828338623,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9704999923706055,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.8814927339553833},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.7930850982666016},{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.6624318361282349},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.6247797012329102},{"id":"https://openalex.org/keywords/transformer","display_name":"Transformer","score":0.5904720425605774},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.5435592532157898},{"id":"https://openalex.org/keywords/storytelling","display_name":"Storytelling","score":0.5364428758621216},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.48161429166793823},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4559522271156311},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.34553539752960205},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.3239665627479553},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.1604614555835724},{"id":"https://openalex.org/keywords/narrative","display_name":"Narrative","score":0.13113310933113098}],"concepts":[{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.8814927339553833},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7930850982666016},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.6624318361282349},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6247797012329102},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.5904720425605774},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5435592532157898},{"id":"https://openalex.org/C2776538412","wikidata":"https://www.wikidata.org/wiki/Q989963","display_name":"Storytelling","level":3,"score":0.5364428758621216},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.48161429166793823},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4559522271156311},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.34553539752960205},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.3239665627479553},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.1604614555835724},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.13113310933113098},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.1109/ivcnz61134.2023.10344297","is_oa":false,"landing_page_url":"http://dx.doi.org/10.1109/ivcnz61134.2023.10344297","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2023 38th International Conference on Image and Vision Computing New Zealand (IVCNZ)","raw_type":"proceedings-article"},{"id":"pmh:oai:pure.atira.dk:publications/588c8df6-231e-4404-8daf-c7339eec1692","is_oa":false,"landing_page_url":"https://research-repository.uwa.edu.au/en/publications/588c8df6-231e-4404-8daf-c7339eec1692","pdf_url":null,"source":{"id":"https://openalex.org/S4306402523","display_name":"UWA Profiles and Research Repository (University of Western Australia)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I177877127","host_organization_name":"The University of Western Australia","host_organization_lineage":["https://openalex.org/I177877127"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"Malakan , Z M , Hassan , M &amp; Mian , A 2023 , Sequential Image Storytelling Model Based on Transformer Attention Pooling . in D Bailey , A Punchihewa &amp; A Paturkar (eds) , 2023 38th International Conference on Image and Vision Computing New Zealand (IVCNZ) . International Conference Image and Vision Computing New Zealand , IEEE, Institute of Electrical and Electronics Engineers , USA , pp. 1-6 , 38th International Conference on Image and Vision Computing New Zealand , Palmerston North , New Zealand , 29/11/23 . https://doi.org/10.1109/IVCNZ61134.2023.10344297","raw_type":"contributionToPeriodical"}],"best_oa_location":null,"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.7599999904632568}],"awards":[],"funders":[{"id":"https://openalex.org/F4320315885","display_name":"Australian Government","ror":"https://ror.org/0314h5y94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":43,"referenced_works":["https://openalex.org/W1601567445","https://openalex.org/W2183386595","https://openalex.org/W2194775991","https://openalex.org/W2556388456","https://openalex.org/W2559780844","https://openalex.org/W2768287968","https://openalex.org/W2804668597","https://openalex.org/W2924176943","https://openalex.org/W2951390634","https://openalex.org/W2962990649","https://openalex.org/W2963177403","https://openalex.org/W2979739834","https://openalex.org/W2987862245","https://openalex.org/W2990818246","https://openalex.org/W2998106530","https://openalex.org/W3017628311","https://openalex.org/W3034221024","https://openalex.org/W3035392611","https://openalex.org/W3045015651","https://openalex.org/W3049209276","https://openalex.org/W3083587785","https://openalex.org/W3087871082","https://openalex.org/W3126515765","https://openalex.org/W3128339783","https://openalex.org/W3130160131","https://openalex.org/W3174012740","https://openalex.org/W3217340782","https://openalex.org/W4200438403","https://openalex.org/W4213281889","https://openalex.org/W4220790454","https://openalex.org/W4282968790","https://openalex.org/W4288083805","https://openalex.org/W4312463400","https://openalex.org/W4312561350","https://openalex.org/W4312577078","https://openalex.org/W4383750972","https://openalex.org/W4385245566","https://openalex.org/W4386066385","https://openalex.org/W4386075661","https://openalex.org/W6686286932","https://openalex.org/W6739901393","https://openalex.org/W6751888654","https://openalex.org/W6781447924"],"related_works":["https://openalex.org/W4210416330","https://openalex.org/W2775506363","https://openalex.org/W3088136942","https://openalex.org/W4290852288","https://openalex.org/W2949362007","https://openalex.org/W4388893791","https://openalex.org/W4283207562","https://openalex.org/W2963177403","https://openalex.org/W2330246314","https://openalex.org/W2949522393"],"abstract_inverted_index":{"The":[0],"Visual":[1],"Storytelling":[2,83],"Task":[3],"(VST)":[4],"extends":[5],"beyond":[6],"describing":[7],"a":[8,23,78],"single":[9],"image,":[10],"such":[11,27],"as":[12],"image":[13,17],"captioning,":[14],"to":[15,55,63,121],"sequential":[16],"descriptions":[18,28],"in":[19,31,43,49],"the":[20,88,112,123,137,149],"form":[21],"of":[22,66],"coherent":[24],"story.":[25],"However,":[26],"present":[29],"challenges":[30],"handling":[32],"varying":[33],"language":[34,109],"styles,":[35],"relational":[36],"role-modeling,":[37],"consistency,":[38],"and":[39,59,105,118,142],"events":[40],"not":[41],"evident":[42],"individual":[44],"images.":[45,129],"A":[46],"common":[47],"limitation":[48],"existing":[50],"approaches":[51],"is":[52],"their":[53],"inability":[54],"fully":[56],"describe":[57],"relations":[58],"visual":[60],"changes,":[61],"leading":[62],"stories":[64],"devoid":[65],"linguistic":[67],"cohesion":[68],"between":[69,127],"multiple":[70,100],"sentences.":[71],"To":[72],"address":[73],"this":[74],"challenge,":[75],"we":[76],"introduce":[77],"novel":[79],"framework,":[80],"Sequential":[81],"Image":[82],"Model":[84],"(SISM),":[85],"based":[86],"on":[87,136,145,152],"Transformer":[89],"architecture.":[90],"Our":[91,130],"model":[92,132],"contextualizes":[93],"input":[94,128],"images":[95],"by":[96],"dividing":[97],"them":[98,107],"into":[99],"16":[101,103],"\u00d7":[102],"patches":[104],"associates":[106],"with":[108],"content":[110],"using":[111],"encoder-decoder":[113],"technique.":[114],"It":[115],"incorporates":[116],"cross-attention":[117],"attention":[119],"pooling":[120],"identify":[122],"most":[124],"relevant":[125],"relationships":[126],"proposed":[131],"achieves":[133],"state-of-the-art":[134],"performance":[135],"recently":[138],"published":[139],"SSID":[140],"dataset":[141],"performs":[143],"competitively":[144],"VIST":[146],"dataset,":[147],"achieving":[148],"top":[150],"score":[151],"BLEU-1":[153],"metric.":[154]},"counts_by_year":[],"updated_date":"2026-04-04T16:13:02.066488","created_date":"2025-10-10T00:00:00"}
