Publications

* denotes equal contribution

(α-β) denotes alphabetical order

2025

  1. A Theory of Learning with Autoregressive Chain of Thought
    Nirmit Joshi, Gal Vardi, Adam Block, Surbhi Goel, Zhiyuan Li, Theodor Misiakiewicz, and Nathan Srebro
    In Proceedings of the 38th Conference on Learning Theory, COLT 2025
  2. Structured Preconditioners in Adaptive Optimization: A Unified Analysis
    Shuo Xie, Tianhao Wang, Sashank Reddi, Sanjiv Kumar, and Zhiyuan Li
    In Proceedings of the 42nd International Conference on Machine Learning, ICML 2025
  3. PENCIL: Long Thoughts with Short Memory
    Chenxiao Yang, Nathan Srebro, David McAllester, and Zhiyuan Li
    In Proceedings of the 42nd International Conference on Machine Learning, ICML 2025
  4. Weak-to-Strong Generalization Even in Random Feature Networks, Provably
    Marko Medvedev, Kaifeng Lyu, Dingli Yu, Sanjeev Arora, Zhiyuan Li, and Nathan Srebro
    In Proceedings of the 42nd International Conference on Machine Learning, ICML 2025
  5. Non-Asymptotic Length Generalization
    Thomas Chen, Tengyu Ma, and Zhiyuan Li
    In Proceedings of the 42nd International Conference on Machine Learning, ICML 2025
  6. Chain-of-Thought Provably Enables Learning the (Otherwise) Unlearnable
    Chenxiao Yang, Zhiyuan Li, and David Wipf
    In The Thirteenth International Conference on Learning Representations, ICLR 2025
  7. A Coefficient Makes SVRG Effective
    Yida Yin, Zhiqiu Xu, Zhiyuan Li, Trevor Darrell, and Zhuang Liu
    In The Thirteenth International Conference on Learning Representations, ICLR 2025
  8. Reasoning with Latent Thoughts: On the Power of Looped Transformers
    Nikunj Saunshi, Nishanth Dikkala, Zhiyuan Li, Sanjiv Kumar, and Sashank J Reddi
    In The Thirteenth International Conference on Learning Representations, ICLR 2025
  9. Understanding Warmup-Stable-Decay Learning Rates: A River Valley Loss Landscape Perspective
    Kaiyue Wen, Zhiyuan Li, Jason Wang, David Hall, Percy Liang, and Tengyu Ma
    In The Thirteenth International Conference on Learning Representations, ICLR 2025
  10. Adam Exploits $\ell_\infty$-geometry of Loss Landscape via Coordinate-wise Adaptivity
    Shuo Xie, Mohamad Amin Mohamadi, and Zhiyuan Li
    In The Thirteenth International Conference on Learning Representations, ICLR 2025

2024

  1. Implicit Bias of AdamW: $\ell_\infty$-Norm Constrained Optimization
    Shuo Xie and Zhiyuan Li
    In Proceedings of the 41st International Conference on Machine Learning, ICML 2024
  2. Why Do You Grok? A Theoretical Analysis of Grokking Modular Addition
    Mohamad Amin Mohamadi, Zhiyuan Li, Lei Wu, and Danica J Sutherland
    In Proceedings of the 41st International Conference on Machine Learning, ICML 2024
  3. Simplicity Bias via Global Convergence of Sharpness Minimization
    Khashayar Gatmiry, Zhiyuan Li, Sashank J Reddi, and Stefanie Jegelka
    In Proceedings of the 41st International Conference on Machine Learning, ICML 2024
  4. Chain of Thought Empowers Transformers to Solve Inherently Serial Problems
    Zhiyuan Li, Hong Liu, Denny Zhou, and Tengyu Ma
    In The Twelfth International Conference on Learning Representations, ICLR 2024
  5. Fast Equilibrium of SGD in Generic Situations
    Zhiyuan Li, Yi Wang, and Zhiren Wang (α-β)
    In The Twelfth International Conference on Learning Representations, ICLR 2024
  6. Dichotomy of Early and Late Phase Implicit Biases Can Provably Induce Grokking
    Kaifeng Lyu, Jikai Jin, Zhiyuan Li, Simon S Du, Jason D Lee, and Wei Hu
    In The Twelfth International Conference on Learning Representations, ICLR 2024
  7. Sophia: A Scalable Stochastic Second-Order Optimizer for Language Model Pre-Training
    Hong Liu, Zhiyuan Li, David Hall, Percy Liang, and Tengyu Ma
    In The Twelfth International Conference on Learning Representations, ICLR 2024
  8. The Marginal Value of Momentum for Small Learning Rate SGD
    Runzhe Wang, Sadhika Malladi, Tianhao Wang, Kaifeng Lyu, and Zhiyuan Li
    In The Twelfth International Conference on Learning Representations, ICLR 2024

2023

  1. What is the Inductive Bias of Flatness Regularization? A Study of Deep Matrix Factorization Models
    Khashayar Gatmiry, Zhiyuan Li, Tengyu Ma, Sashank Reddi, Stefanie Jegelka, and Ching-Yao Chuang
    In Advances in Neural Information Processing Systems 36, NeurIPS 2023
  2. Sharpness Minimization Algorithms Do Not Only Minimize Sharpness to Achieve Better Generalization
    Kaiyue Wen, Zhiyuan Li, and Tengyu Ma
    In Advances in Neural Information Processing Systems 36, NeurIPS 2023
  3. Understanding Incremental Learning of Gradient Descent: A Fine-Grained Analysis of Matrix Sensing
    Jikai Jin, Zhiyuan Li, Kaifeng Lyu, Simon Shaolei Du, and Jason D Lee
    In Proceedings of the 40th International Conference on Machine Learning, ICML 2023
  4. Same Pre-Training Loss, Better Downstream: Implicit Bias Matters for Language Models
    Hong Liu, Sang Michael Xie, Zhiyuan Li, and Tengyu Ma
    In Proceedings of the 40th International Conference on Machine Learning, ICML 2023
  5. How Does Sharpness-Aware Minimization Minimize Sharpness?
    Kaiyue Wen, Tengyu Ma, and Zhiyuan Li
    In The Eleventh International Conference on Learning Representations, ICLR 2023

2022

  1. Fast Mixing of Stochastic Gradient Descent with Normalization and Weight Decay
    Zhiyuan Li, Tianhao Wang, and Dingli Yu
    In Advances in Neural Information Processing Systems 35, NeurIPS 2022
  2. Implicit Bias of Gradient Descent on Reparametrized Models: On Equivalence to Mirror Descent
    Zhiyuan Li*, Tianhao Wang*, Jason D Lee, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 35, NeurIPS 2022
  3. Understanding the Generalization Benefit of Normalization Layers: Sharpness Reduction
    Kaifeng Lyu, Zhiyuan Li, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 35, NeurIPS 2022
  4. Bridging Theory and Practice in Deep Learning: Optimization and Generalization
    Zhiyuan Li
    Princeton University, PhD Thesis 2022
  5. Understanding Gradient Descent on the Edge of Stability in Deep Learning
    Sanjeev Arora, Zhiyuan Li, and Abhishek Panigrahi (α-β)
    In Proceedings of the 39th International Conference on Machine Learning, ICML 2022
  6. Robust Training of Neural Networks Using Scale Invariant Architectures
    Zhiyuan Li, Srinadh Bhojanapalli, Manzil Zaheer, Sashank Reddi, and Sanjiv Kumar
    In Proceedings of the 39th International Conference on Machine Learning, ICML 2022
  7. What Happens after SGD Reaches Zero Loss?--A Mathematical Framework
    Zhiyuan Li, Tianhao Wang, and Sanjeev Arora
    In The Tenth International Conference on Learning Representations, ICLR 2022

2021

  1. Gradient Descent on Two-Layer Nets: Margin Maximization and Simplicity Bias
    Kaifeng Lyu*, Zhiyuan Li*, Runzhe Wang*, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 34, NeurIPS 2021
  2. When Is Particle Filtering Efficient for Planning in Partially Observed Linear Dynamical Systems?
    Simon S Du, Wei Hu, Zhiyuan Li, Ruoqi Shen, Zhao Song, and Jiajun Wu
    In Proceedings of the 37th Conference on Uncertainty in Artificial Intelligence, UAI 2021
  3. Risk Bounds and Rademacher Complexity in Batch Reinforcement Learning
    Yaqi Duan, Chi Jin, and Zhiyuan Li (α-β)
    In Proceedings of the 38th International Conference on Machine Learning, ICML 2021
  4. On the Validity of Modeling SGD with Stochastic Differential Equations (SDEs)
    Zhiyuan Li, Sadhika Malladi, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 34, NeurIPS 2021
  5. Towards Resolving the Implicit Bias of Gradient Descent for Matrix Factorization: Greedy Low-Rank Learning
    Zhiyuan Li, Yuping Luo, and Kaifeng Lyu (α-β)
    In The Ninth International Conference on Learning Representations, ICLR 2021
  6. Why Are Convolutional Nets More Sample-Efficient Than Fully-Connected Nets?
    Zhiyuan Li, Yi Zhang, and Sanjeev Arora
    In The Ninth International Conference on Learning Representations, ICLR 2021

2020

  1. Implicit Regularization and Convergence for Weight Normalization
    Xiaoxia Wu, Edgar Dobriban, Tongzheng Ren, Shanshan Wu, Zhiyuan Li, Suriya Gunasekar, Rachel Ward, and Qiang Liu
    In Advances in Neural Information Processing Systems 33, NeurIPS 2020
  2. Reconciling Modern Deep Learning with Traditional Optimization Analyses: The Intrinsic Learning Rate
    Zhiyuan Li*, Kaifeng Lyu*, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 33, NeurIPS 2020
  3. Simple and Effective Regularization Methods for Training on Noisily Labeled Data with Generalization Guarantee
    Wei Hu, Zhiyuan Li, and Dingli Yu
    In The Eighth International Conference on Learning Representations, ICLR 2020
  4. Harnessing the Power of Infinitely Wide Deep Nets on Small-Data Tasks
    Sanjeev Arora, Simon S Du, Zhiyuan Li, Ruslan Salakhutdinov, Ruosong Wang, and Dingli Yu (α-β)
    In The Eighth International Conference on Learning Representations, ICLR 2020
  5. An Exponential Learning Rate Schedule for Deep Learning
    Zhiyuan Li and Sanjeev Arora
    In The Eighth International Conference on Learning Representations, ICLR 2020

2019

  1. Enhanced Convolutional Neural Tangent Kernels
    Zhiyuan Li*, Ruosong Wang*, Dingli Yu*, Simon S Du, Wei Hu, Ruslan Salakhutdinov, and Sanjeev Arora
    arXiv preprint arXiv:1911.00809, 2019
  2. Explaining Landscape Connectivity of Low-Cost Solutions for Multilayer Nets
    Rohith Kuditipudi, Xiang Wang, Holden Lee, Yi Zhang, Zhiyuan Li, Wei Hu, Rong Ge, and Sanjeev Arora
    In Advances in Neural Information Processing Systems 32, NeurIPS 2019
  3. On Exact Computation with an Infinitely Wide Neural Net
    Sanjeev Arora, Simon S Du, Wei Hu, Zhiyuan Li, Ruslan Salakhutdinov, and Ruosong Wang (α-β)
    In Advances in Neural Information Processing Systems 32, NeurIPS 2019
  4. Fine-Grained Analysis of Optimization and Generalization for Overparameterized Two-Layer Neural Networks
    Sanjeev Arora, Simon S Du, Wei Hu, Zhiyuan Li, and Ruosong Wang (α-β)
    In Proceedings of the 36th International Conference on Machine Learning, ICML 2019
  5. Theoretical Analysis of Auto Rate-Tuning by Batch Normalization
    Sanjeev Arora, Zhiyuan Li, and Kaifeng Lyu (α-β)
    In The Seventh International Conference on Learning Representations, ICLR 2019
  6. Towards Understanding the Role of Over-Parametrization in Generalization of Neural Networks
    Behnam Neyshabur, Zhiyuan Li, Srinadh Bhojanapalli, Yann LeCun, and Nathan Srebro
    In The Seventh International Conference on Learning Representations, ICLR 2019

2018

  1. Online Improper Learning with an Approximation Oracle
    Elad Hazan, Wei Hu, Yuanzhi Li, and Zhiyuan Li (α-β)
    In Advances in Neural Information Processing Systems 31, NeurIPS 2018

2017

  1. Stability of Generalized Two-Sided Markets with Transaction Thresholds
    Zhiyuan Li, Yicheng Liu, Pingzhong Tang, Tingting Xu, and Wei Zhan (α-β)
    In Proceedings of the 16th International Conference on Autonomous Agents and Multiagent Systems, AAMAS 2017

2016

  1. Learning in Games: Robustness of Fast Convergence
    Dylan J Foster, Zhiyuan Li, Thodoris Lykouris, Karthik Sridharan, and Eva Tardos (α-β)
    In Advances in Neural Information Processing Systems 29, NeurIPS 2016
  2. Solving Marginal MAP Problems with NP Oracles and Parity Constraints
    Yexiang Xue, Zhiyuan Li, Stefano Ermon, Carla P Gomes, and Bart Selman
    In Advances in Neural Information Processing Systems 29, NeurIPS 2016