{"id":"https://openalex.org/W7162643813","doi":"https://doi.org/10.48550/arxiv.2605.28714","title":"IPO-Mine: A Toolkit and Dataset for Section-Structured Analysis of Long, Multimodal IPO Documents","display_name":"IPO-Mine: A Toolkit and Dataset for Section-Structured Analysis of Long, Multimodal IPO Documents","publication_year":2026,"publication_date":"2026-05-27","ids":{"openalex":"https://openalex.org/W7162643813","doi":"https://doi.org/10.48550/arxiv.2605.28714"},"language":null,"primary_location":{"id":"doi:10.48550/arxiv.2605.28714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"type":"preprint","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://doi.org/10.48550/arxiv.2605.28714","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5137298911","display_name":"Michael Galarnyk","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Galarnyk, Michael","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064946856","display_name":"Siddharth Lohani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lohani, Siddharth","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137245852","display_name":"Vidhyakshaya Kannan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kannan, Vidhyakshaya","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5135308283","display_name":"Sagnik Nandi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nandi, Sagnik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137256874","display_name":"Aman Patel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Patel, Aman","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137262552","display_name":"Liqin Ye","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Liqin","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5092579830","display_name":"Arnav Hiray","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hiray, Arnav","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5107099336","display_name":"Rutwik Routu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Routu, Rutwik","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5137262123","display_name":"Prasun Banerjee","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Banerjee, Prasun","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5117596069","display_name":"Siddhartha Somani","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Somani, Siddhartha","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5137208918","display_name":"Sudheer Chava","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chava, Sudheer","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10081","display_name":"Auditing, Earnings Management, Governance","score":0.16179999709129333,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10081","display_name":"Auditing, Earnings Management, Governance","score":0.16179999709129333,"subfield":{"id":"https://openalex.org/subfields/1402","display_name":"Accounting"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T13904","display_name":"Artificial Intelligence Applications","score":0.040300000458955765,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13794","display_name":"Financial Reporting and XBRL","score":0.038600001484155655,"subfield":{"id":"https://openalex.org/subfields/1404","display_name":"Management Information Systems"},"field":{"id":"https://openalex.org/fields/14","display_name":"Business, Management and Accounting"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/initial-public-offering","display_name":"Initial public offering","score":0.7024999856948853},{"id":"https://openalex.org/keywords/upload","display_name":"Upload","score":0.6599000096321106},{"id":"https://openalex.org/keywords/workflow","display_name":"Workflow","score":0.5803999900817871},{"id":"https://openalex.org/keywords/parsing","display_name":"Parsing","score":0.5752999782562256},{"id":"https://openalex.org/keywords/benchmark","display_name":"Benchmark (surveying)","score":0.5259000062942505},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.5189999938011169},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.3686000108718872}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7210000157356262},{"id":"https://openalex.org/C12559387","wikidata":"https://www.wikidata.org/wiki/Q185142","display_name":"Initial public offering","level":2,"score":0.7024999856948853},{"id":"https://openalex.org/C71901391","wikidata":"https://www.wikidata.org/wiki/Q7126699","display_name":"Upload","level":2,"score":0.6599000096321106},{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.5803999900817871},{"id":"https://openalex.org/C186644900","wikidata":"https://www.wikidata.org/wiki/Q194152","display_name":"Parsing","level":2,"score":0.5752999782562256},{"id":"https://openalex.org/C185798385","wikidata":"https://www.wikidata.org/wiki/Q1161707","display_name":"Benchmark (surveying)","level":2,"score":0.5259000062942505},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.5189999938011169},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.4140999913215637},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3686000108718872},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.3628999888896942},{"id":"https://openalex.org/C190812933","wikidata":"https://www.wikidata.org/wiki/Q28923","display_name":"Chart","level":2,"score":0.3285999894142151},{"id":"https://openalex.org/C199033989","wikidata":"https://www.wikidata.org/wiki/Q1318295","display_name":"Narrative","level":2,"score":0.32409998774528503},{"id":"https://openalex.org/C148524875","wikidata":"https://www.wikidata.org/wiki/Q6975395","display_name":"F1 score","level":2,"score":0.3091000020503998},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.30149999260902405},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.30000001192092896},{"id":"https://openalex.org/C2778334786","wikidata":"https://www.wikidata.org/wiki/Q1586270","display_name":"Variation (astronomy)","level":2,"score":0.290800005197525},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.2897999882698059},{"id":"https://openalex.org/C171686336","wikidata":"https://www.wikidata.org/wiki/Q3532085","display_name":"Topic model","level":2,"score":0.28439998626708984},{"id":"https://openalex.org/C121955636","wikidata":"https://www.wikidata.org/wiki/Q4116214","display_name":"Accounting","level":1,"score":0.2720000147819519},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.27149999141693115},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.2709999978542328},{"id":"https://openalex.org/C2777264242","wikidata":"https://www.wikidata.org/wiki/Q959950","display_name":"XBRL","level":2,"score":0.2621000111103058}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.48550/arxiv.2605.28714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"doi:10.48550/arxiv.2605.28714","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2605.28714","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"article"},"sustainable_development_goals":[{"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9","score":0.6304962038993835}],"awards":[],"funders":[],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"An":[0],"Initial":[1],"Public":[2],"Offering":[3],"(IPO)":[4],"filing":[5],"is":[6,50],"a":[7,11,27,130],"document":[8],"released":[9],"when":[10],"private":[12],"firm":[13],"goes":[14],"public,":[15],"allowing":[16],"individual":[17],"(retail)":[18],"investors":[19],"to":[20,46,145],"purchase":[21],"its":[22],"shares.":[23],"These":[24,67],"filings":[25,60,72,94,140],"describe":[26],"firm's":[28],"business,":[29],"financials,":[30],"and":[31,33,41,64,77,91,99,109,141,147,164,205,210,217],"risks":[32],"are":[34,219],"long,":[35,120,190],"multimodal":[36,65,121,133,172,187],"documents":[37,68],"with":[38,61],"narrative":[39],"text":[40,98],"images.":[42,101,151],"Despite":[43],"their":[44],"importance":[45],"financial":[47,159],"markets,":[48],"there":[49],"no":[51],"large-scale,":[52,115],"standardized":[53,96],"dataset":[54,134],"or":[55],"benchmark":[56],"for":[57,89],"studying":[58],"IPO":[59,93,139],"modern":[62],"language":[63],"models.":[66],"pose":[69],"significant":[70],"challenges:":[71],"frequently":[73],"exceed":[74],"500,000":[75],"tokens":[76],"lack":[78],"consistent":[79],"structural":[80],"organization.":[81],"We":[82,152],"introduce":[83],"the":[84,128,196],"IPO-Toolkit,":[85],"an":[86],"open-source":[87],"framework":[88],"downloading":[90],"parsing":[92],"into":[95],"section-structured":[97],"extracted":[100,158],"The":[102],"toolkit":[103],"segments":[104],"filings,":[105],"extracts":[106],"embedded":[107],"images,":[108],"produces":[110],"structured":[111,154],"outputs":[112],"that":[113,170],"enable":[114],"reproducible":[116],"analysis":[117,200],"workflows":[118],"over":[119,149,157,189],"documents.":[122,193],"Using":[123],"this":[124],"infrastructure,":[125],"we":[126],"construct":[127],"IPO-Dataset,":[129],"large,":[131],"section-structured,":[132],"covering":[135],"more":[136],"than":[137],"109,000":[138],"amendments":[142],"from":[143,176],"1994":[144],"2026":[146],"containing":[148],"76,000":[150],"establish":[153],"evaluation":[155],"tasks":[156],"charts,":[160],"including":[161],"chart":[162],"quality":[163],"misleadingness":[165],"assessment.":[166],"Our":[167,214],"experiments":[168],"show":[169],"state-of-the-art":[171],"models":[173],"often":[174],"diverge":[175],"expert":[177],"human":[178],"judgments":[179],"on":[180],"these":[181],"tasks,":[182],"exposing":[183],"alignment":[184],"challenges":[185],"in":[186,208],"reasoning":[188],"real-world":[191],"regulatory":[192],"Beyond":[194],"benchmarking,":[195],"IPO-Dataset":[197],"enables":[198],"large-scale":[199],"of":[201],"section-level":[202],"textual":[203,211],"variation":[204],"cross-industry":[206],"differences":[207],"visual":[209],"disclosure":[212],"practices.":[213],"code,":[215],"dataset,":[216],"website":[218],"publicly":[220],"available":[221],"under":[222],"CC-BY-4.0.":[223]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2026-05-29T00:00:00"}
