{"id":"https://openalex.org/W4416034300","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.1123","title":"QA\u2010LIGN: Aligning LLMs through Constitutionally Decomposed QA","display_name":"QA\u2010LIGN: Aligning LLMs through Constitutionally Decomposed QA","publication_year":2025,"publication_date":"2025-01-01","ids":{"openalex":"https://openalex.org/W4416034300","doi":"https://doi.org/10.18653/v1/2025.findings-emnlp.1123"},"language":null,"primary_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.1123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1123","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"type":"preprint","indexed_in":["arxiv","crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2025.findings-emnlp.1123.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5013239627","display_name":"Jacob Dineen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jacob Dineen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5098921482","display_name":"Aswin Rrv","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Aswin Rrv","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120308546","display_name":"Qin Liu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qin Liu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100874726","display_name":"Zhikun Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhikun Xu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112599805","display_name":"Xiao Ye","orcid":"https://orcid.org/0009-0009-0947-8311"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiao Ye","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113833982","display_name":"Ming Shen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ming Shen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101786171","display_name":"Zhaonan Li","orcid":"https://orcid.org/0000-0002-5928-1084"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhaonan Li","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101054554","display_name":"Shijie Lu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shijie Lu","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083735830","display_name":"Chitta Baral","orcid":"https://orcid.org/0000-0002-7549-723X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chitta Baral","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5120308545","display_name":"Muhao Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Muhao Chen","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5067460538","display_name":"Ben Zhou","orcid":"https://orcid.org/0000-0001-7177-2882"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ben Zhou","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":11,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.15483241,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"20619","last_page":"20642"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.4966000020503998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.4966000020503998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.17180000245571136,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.07100000232458115,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/transparency","display_name":"Transparency (behavior)","score":0.7002000212669373},{"id":"https://openalex.org/keywords/rubric","display_name":"Rubric","score":0.6150000095367432},{"id":"https://openalex.org/keywords/modular-design","display_name":"Modular design","score":0.47780001163482666},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.44600000977516174},{"id":"https://openalex.org/keywords/pareto-optimal","display_name":"Pareto optimal","score":0.44339999556541443},{"id":"https://openalex.org/keywords/quality","display_name":"Quality (philosophy)","score":0.44190001487731934},{"id":"https://openalex.org/keywords/natural-language","display_name":"Natural language","score":0.4262000024318695},{"id":"https://openalex.org/keywords/pareto-principle","display_name":"Pareto principle","score":0.42260000109672546}],"concepts":[{"id":"https://openalex.org/C2780233690","wikidata":"https://www.wikidata.org/wiki/Q535347","display_name":"Transparency (behavior)","level":2,"score":0.7002000212669373},{"id":"https://openalex.org/C111640148","wikidata":"https://www.wikidata.org/wiki/Q847349","display_name":"Rubric","level":2,"score":0.6150000095367432},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.609499990940094},{"id":"https://openalex.org/C101468663","wikidata":"https://www.wikidata.org/wiki/Q1620158","display_name":"Modular design","level":2,"score":0.47780001163482666},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44600000977516174},{"id":"https://openalex.org/C2986314615","wikidata":"https://www.wikidata.org/wiki/Q36829","display_name":"Pareto optimal","level":3,"score":0.44339999556541443},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.44190001487731934},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4336000084877014},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4262000024318695},{"id":"https://openalex.org/C137635306","wikidata":"https://www.wikidata.org/wiki/Q182667","display_name":"Pareto principle","level":2,"score":0.42260000109672546},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.39169999957084656},{"id":"https://openalex.org/C59577422","wikidata":"https://www.wikidata.org/wiki/Q10265143","display_name":"False accusation","level":2,"score":0.2913999855518341},{"id":"https://openalex.org/C2776187449","wikidata":"https://www.wikidata.org/wiki/Q1513879","display_name":"Natural language generation","level":3,"score":0.29089999198913574},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.28929999470710754},{"id":"https://openalex.org/C162118730","wikidata":"https://www.wikidata.org/wiki/Q1128453","display_name":"Actuarial science","level":1,"score":0.28610000014305115},{"id":"https://openalex.org/C190253527","wikidata":"https://www.wikidata.org/wiki/Q295354","display_name":"Law and economics","level":1,"score":0.2833000123500824},{"id":"https://openalex.org/C2776608160","wikidata":"https://www.wikidata.org/wiki/Q4785462","display_name":"Natural (archaeology)","level":2,"score":0.2806999981403351},{"id":"https://openalex.org/C134537474","wikidata":"https://www.wikidata.org/wiki/Q17144832","display_name":"Naturalness","level":2,"score":0.28040000796318054},{"id":"https://openalex.org/C112930515","wikidata":"https://www.wikidata.org/wiki/Q4389547","display_name":"Risk analysis (engineering)","level":1,"score":0.2777000069618225},{"id":"https://openalex.org/C144133560","wikidata":"https://www.wikidata.org/wiki/Q4830453","display_name":"Business","level":0,"score":0.2712000012397766},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.2655999958515167},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.26109999418258667},{"id":"https://openalex.org/C39549134","wikidata":"https://www.wikidata.org/wiki/Q133080","display_name":"Public relations","level":1,"score":0.257999986410141},{"id":"https://openalex.org/C84976871","wikidata":"https://www.wikidata.org/wiki/Q2015673","display_name":"Openness to experience","level":2,"score":0.2565999925136566},{"id":"https://openalex.org/C2778143727","wikidata":"https://www.wikidata.org/wiki/Q1820650","display_name":"Readability","level":2,"score":0.25589999556541443},{"id":"https://openalex.org/C42475967","wikidata":"https://www.wikidata.org/wiki/Q194292","display_name":"Operations research","level":1,"score":0.2529999911785126},{"id":"https://openalex.org/C77618280","wikidata":"https://www.wikidata.org/wiki/Q1155772","display_name":"Scheme (mathematics)","level":2,"score":0.2502000033855438}],"mesh":[],"locations_count":2,"locations":[{"id":"doi:10.18653/v1/2025.findings-emnlp.1123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1123","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},{"id":"pmh:oai:arXiv.org:2506.08123","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2506.08123","pdf_url":"https://arxiv.org/pdf/2506.08123","source":{"id":"https://openalex.org/S4393918464","display_name":"ArXiv.org","issn_l":"2331-8422","issn":["2331-8422"],"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":null,"raw_type":"text"}],"best_oa_location":{"id":"doi:10.18653/v1/2025.findings-emnlp.1123","is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2025.findings-emnlp.1123","pdf_url":"https://aclanthology.org/2025.findings-emnlp.1123.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Findings of the Association for Computational Linguistics: EMNLP 2025","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4416034300.pdf","grobid_xml":"https://content.openalex.org/works/W4416034300.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":[],"abstract_inverted_index":{"Alignment":[0],"of":[1],"large":[2],"language":[3,39],"models":[4,99],"(LLMs)":[5],"with":[6,96],"principles":[7],"like":[8],"helpfulness,":[9],"honesty,":[10],"and":[11,46,61,90,94,110],"harmlessness":[12],"typically":[13],"relies":[14],"on":[15],"scalar":[16],"rewards":[17,31],"that":[18,105],"obscure":[19],"which":[20,28],"objectives":[21],"drive":[22],"the":[23,53],"training":[24],"signal.We":[25],"introduce":[26],"QA-LIGN,":[27],"decomposes":[29],"monolithic":[30],"into":[32],"interpretable":[33,109],"principle-specific":[34],"evaluations":[35],"through":[36,42],"structured":[37],"natural":[38],"programs.Models":[40],"learn":[41],"a":[43,80],"draft,":[44],"critique,":[45],"revise":[47],"pipeline,":[48],"where":[49],"symbolic":[50],"evaluation":[51],"against":[52],"rubrics":[54],"provides":[55],"transparent":[56],"feedback":[57],"for":[58],"both":[59,92],"initial":[60],"revised":[62],"responses":[63],"during":[64],"GRPO":[65,95],"training.Applied":[66],"to":[67,76],"uncensored":[68],"Llama-3.1-8B-Instruct,QA-LIGN":[69],"reduces":[70],"attack":[71],"success":[72],"rates":[73],"by":[74],"up":[75],"68.7%":[77],"while":[78],"maintaining":[79],"0.67%":[81],"false":[82],"refusal":[83],"rate,":[84],"achieving":[85],"Pareto":[86],"optimal":[87],"safetyhelpfulness":[88],"performance":[89],"outperforming":[91],"DPO":[93],"state-of-the-art":[97],"reward":[98,107],"given":[100],"equivalent":[101],"training.These":[102],"results":[103],"demonstrate":[104],"making":[106],"signals":[108],"modular":[111],"improves":[112],"alignment":[113],"effectiveness,":[114],"suggesting":[115],"transparency":[116],"enhances":[117],"LLM":[118],"safety.":[119]},"counts_by_year":[],"updated_date":"2026-06-21T07:57:09.225873","created_date":"2025-11-08T00:00:00"}
